Spaces:

Mike0021
/

moebius

Running on Zero

App Files Files Community

Mike0021 committed 11 days ago

Commit

166ab04

verified ·

1 Parent(s): 7874d4a

Implement Moebius Gradio Space

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +18 -0
.gitignore +5 -0
LICENSE +199 -0
README.md +195 -8
app.py +229 -0
assets/logo_dynamic_woWaterMark.gif +3 -0
assets/pipeline.png +3 -0
assets/qualitative_comparison.png +3 -0
assets/sup_showcase_celebahq_ffhq.png +3 -0
assets/sup_showcase_places_v2.png +3 -0
assets/tab1.png +3 -0
assets/tab1_woTitle.png +3 -0
assets/tab2.png +3 -0
assets/tab3.png +3 -0
assets/tab4.png +3 -0
config/data_demo.yaml +9 -0
config/model_cfg/moebius.yaml +47 -0
config/model_cfg/pixelhacker.yaml +24 -0
config/rand_mask_cfg/random_medium_256.yaml +33 -0
config/rand_mask_cfg/random_medium_512.yaml +32 -0
config/rand_mask_cfg/random_thick_256.yaml +33 -0
config/rand_mask_cfg/random_thick_512.yaml +33 -0
config/rand_mask_cfg/random_thin_256.yaml +25 -0
config/rand_mask_cfg/random_thin_512.yaml +25 -0
config/train_demo.sh +45 -0
data/images/0.png +3 -0
data/images/1.png +3 -0
data/images/10.png +3 -0
data/images/100.png +3 -0
data/images/10000.png +3 -0
data/images/10001.png +3 -0
data/images/10002.png +3 -0
data/images/10003.png +3 -0
data/masks/000000.png +0 -0
data/masks/000001.png +0 -0
data/masks/000002.png +0 -0
data/masks/000003.png +0 -0
data/masks/000004.png +0 -0
data/masks/000005.png +0 -0
data/masks/000006.png +0 -0
data/masks/000007.png +0 -0
data/train_data.jsonl +8 -0
infer/__init__.py +0 -0
infer/infer_moebius.py +45 -0
infer/utils.py +123 -0
infer/utils_dataset.py +211 -0
library/__init__.py +0 -0
library/chinese_sdxl_train_util.py +350 -0
library/custom_train_functions.py +515 -0
library/train_util.py +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,21 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/logo_dynamic_woWaterMark.gif filter=lfs diff=lfs merge=lfs -text
+assets/pipeline.png filter=lfs diff=lfs merge=lfs -text
+assets/qualitative_comparison.png filter=lfs diff=lfs merge=lfs -text
+assets/sup_showcase_celebahq_ffhq.png filter=lfs diff=lfs merge=lfs -text
+assets/sup_showcase_places_v2.png filter=lfs diff=lfs merge=lfs -text
+assets/tab1.png filter=lfs diff=lfs merge=lfs -text
+assets/tab1_woTitle.png filter=lfs diff=lfs merge=lfs -text
+assets/tab2.png filter=lfs diff=lfs merge=lfs -text
+assets/tab3.png filter=lfs diff=lfs merge=lfs -text
+assets/tab4.png filter=lfs diff=lfs merge=lfs -text
+data/images/0.png filter=lfs diff=lfs merge=lfs -text
+data/images/1.png filter=lfs diff=lfs merge=lfs -text
+data/images/10.png filter=lfs diff=lfs merge=lfs -text
+data/images/100.png filter=lfs diff=lfs merge=lfs -text
+data/images/10000.png filter=lfs diff=lfs merge=lfs -text
+data/images/10001.png filter=lfs diff=lfs merge=lfs -text
+data/images/10002.png filter=lfs diff=lfs merge=lfs -text
+data/images/10003.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+__pycache__/
+*.pyc
+.venv/
+outputs/
+weight/

LICENSE ADDED Viewed

	@@ -0,0 +1,199 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to the Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by the Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding any notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
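+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.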
+   Copyright 2024 Moebius Authors
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,13 +1,200 @@
 ---
-title: Moebius
-emoji: 📊
-colorFrom: green
-colorTo: gray
 sdk: gradio
-sdk_version: 6.19.0
-python_version: '3.13'
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Moebius Inpainting
+emoji: 🖌️
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
+sdk_version: 6.10.0
 app_file: app.py
+short_description: Lightweight Moebius image inpainting
+startup_duration_timeout: 1h
+python_version: 3.11
 ---
+<div align="center">
+    <img src="./assets/logo_dynamic_woWaterMark.gif" width="100%"></img>
+</div>
+<div align="center">
+<h2>Moebius: 0.2B Lightweight Image Inpainting Framework with 10B-Level Performance</h2>
+***On par with or surpassing the 10B-level industrial SOTA generalist (FLUX.1-Fill-Dev) on 6 benchmarks across natural and portrait scenes, with only 2% (0.2B) of the parameters and 15× faster inference***
+[Kangsheng Duan](https://github.com/AnduinD)<sup>1,</sup>\*, [Ziyang Xu](https://ziyangxu.top)<sup>1,</sup>\*<sup>,&dagger;</sup>, [Wenyu Liu](http://eic.hust.edu.cn/professor/liuwenyu)<sup>1</sup>, Xiaohu Ruan<sup>2</sup>, [Xiaoxin Chen](https://scholar.google.com/citations?hl=zh-CN&user=SI_oBwsAAAAJ)<sup>2</sup>, [Xinggang Wang](https://xwcv.github.io)<sup>1, :email:</sup>
+(*) Equal Contribution, (<sup>&dagger;</sup>) Project Leader, (<sup>:email:</sup>) Corresponding Author.
+<sup>1</sup> Huazhong University of Science and Technology. <sup>2</sup> VIVO AI Lab.
+[![arxiv](https://img.shields.io/badge/ECCV'26-paper-orange)](https://arxiv.org/abs/2606.19195)  [![license](https://img.shields.io/badge/License-Apache_2.0-blue)](LICENSE)  [![Project](https://img.shields.io/badge/Project_Page-https://hustvl.github.io/Moebius-purple)](https://hustvl.github.io/Moebius)  [![HF Daily Rank](https://img.shields.io/badge/Hugging%20Face-No.%201%20Daily%20Ranking-ffbd00)](https://huggingface.co/papers/date/2026-06-19)
+<br>
+<img src="./assets/pipeline.png" style="margin-bottom: 10px;"></img>
+<img src="./assets/tab1_woTitle.png"></img>
+<img src="./assets/qualitative_comparison.png"></img>
+</div>
+## 🐱‍🏍 Insight & Small Talk
+> ***Moebius*** *is our latest AI Image Inpainting endeavor, serving as a direct continuation of our previous work, **[PixelHacker](https://github.com/hustvl/PixelHacker)**. Named after the concepts of "infinity" and "master painter," Moebius embodies our vision: maintaining exceptional generation quality under highly constrained computational resources while pushing the efficiency of image inpainting to its limits as much as possible.*
+>
+> *Under the iron grip of the Scaling Law, AI research has long devolved into a grueling arms race of burning capital, compute, and data. Consequently, the academic community finds it increasingly difficult to keep pace with the ever-expanding model scales driven by the tech industry.*
+>
+> <p align="center"><b>"<ins>But is this brute-force scaling truly the only path forward?</ins>"</b></p>
+>
+> *Using general-purpose image inpainting as our strategic entry point, we challenge the "scale-at-all-costs" path dependency dictated by the Scaling Law narrative. Through the synergistic optimization of architectural design and knowledge distillation, Moebius achieves a remarkably compact footprint of just **0.22B parameters**. It liberates high-quality image inpainting from the heavy-compute narrative of 10B+ foundation models:*
+> *Across six comprehensive benchmarks spanning both natural and portrait scenes, Moebius performs **on par with**, and in certain scenarios **surpasses**, the inpainting quality of 10B+ industrial state-of-the-art (SOTA) generalist models like *FLUX.1-Fill-Dev*, while delivering a massive **>15× inference acceleration**.*
+>
+> 💡 **The core insight of Moebius can be summarized in a single equation:**
+>
+> $$\begin{aligned}
+> \text{Synergy} \times (\text{Architecture} + \text{Distillation}) = & \text{Shattering the "Impossible Triangle" of} \\
+> & \text{Low Parameters, Fast Inference, and High Quality}
+> \end{aligned}$$
+>
+> --- *written on June 16, 2026* ---
+## 🌟 Highlights
+- **📉 Extreme Parametric Efficiency (< 2%)**: Moebius operates with a mere **0.22B (226M) parameters**, which represents **less than 2%** of the size of the colossal industrial giant *FLUX.1-Fill-Dev (11.9B)*. It shatters the heavy-compute narrative, making high-quality inpainting accessible on consumer-grade and edge devices.
+- **⚡ 15× Inference Speedup (26ms/step)**: Achieves a blistering inference latency of only **26.01 ms per step** on a single GPU. Combined with optimized sampling steps, Moebius delivers an overall **>15× total runtime acceleration** compared to 10B-level models.
+- **🏆 10B-Level Inpainting Quality (on-par-with/surpass FLUX.1-Fill-Dev across 6 benchmarks)**: Size contraction does not mean representation degradation. Through the synergistic optimization of architecture and distillation, Moebius performs on par with, and in certain scenarios (such as complex textures and facial plausibility), surpasses 10B-level state-of-the-art (SOTA) generalist models (*FLUX.1-Fill-Dev, SD3.5 Large-Inpainting*) across 6 comprehensive benchmarks spanning **both natural** scenes (*Places2*) and **portrait** scenes (*CelebA-HQ*, *FFHQ*).
+- **💡 Synergistic Core Innovations**:
+  - **Architecture Design (LλMI Block)**: Reformulates both self- and cross-attention by condensing spatial context and global semantic priors into fixed-size linear matrices, bypassing quadratic computational overhead.
+  - **Adaptive Multi-Granularity Distillation Strategy**: Transfers the representational capacity from our *[PixelHacker](https://github.com/hustvl/PixelHacker)* (teacher) strictly within the latent space (avoiding expensive pixel-space decoding). It bridges the giant capacity gap by aligning multi-granularity supervision—ranging from microscopic intermediate features to macroscopic diffusion trajectories—while dynamically balancing training via a gradient norm adaptive loss weighting mechanism.
+  - **Optimal Synergistic Balancing**: Systematically explores the mutual constraint and upper bound between compact structure and distillation. By mapping this architecture-distillation synergy frontier, we ensure our 0.22B *Moebius* (student) absorbs the maximum semantic reasoning of *[PixelHacker](https://github.com/hustvl/PixelHacker)* (teacher) without triggering representation saturation.
+<div align="center">
+    <img src="./assets/tab2.png" width="70%" style="margin-bottom: 10px;"></img>
+</div>
+- **🚀 Task-Specific Specialist over Bloated Generalists**: Rather than blindly scaling up, Moebius answers a fundamental question: *<ins>Can a model be smarter, lighter, and faster when the task is explicitly defined?</ins>* It serves as a highly optimized specialist that liberates real-world image inpainting and AI object removal from parameter bloat.
+## 🔥 News
+* **`June 19, 2026`:** 🎉 Moebius has achieved the [No. 1 daily ranking](https://huggingface.co/papers/date/2026-06-19) on Hugging Face!
+* **`June 18, 2026`:** 🔥🔥 We have released the training and inference code, and open-sourced the [model weights](https://huggingface.co/hustvl/Moebius) on Hugging Face.
+* **`June 18, 2026`:** 🎉 Moebius is accepted by ECCV'26! We have released the preprint on arXiv, check it [here](https://arxiv.org/abs/2606.19195) ~ 🍻
+* **`June 16, 2026`:** 🔥 We have submitted the GitHub repo for the first time, and there will be more updates soon. Stay tuned! 🤗
+## 🏕️ Performance on Natural Scene
+<div align="center">
+<img src="./assets/tab3.png"></img>
+<img src="./assets/sup_showcase_places_v2.png"></img>
+</div>
+## 🤗 Performance on Portrait Scene
+<div align="center">
+<img src="./assets/tab4.png" width="50%"></img>
+<img src="./assets/sup_showcase_celebahq_ffhq.png"></img>
+</div>
+## ⚖️ Evaluation Resources
+The masks of the evaluation set are shared in [Google Drive](https://drive.google.com/drive/folders/13J91fdQt2RnHp4j-VzdtSrHRHPA1OxJ5?usp=sharing), and the corresponding images can be downloaded from the following open source platforms:
+* Places2: [Places2](http://places2.csail.mit.edu/download-private.html)
+* CelebA-HQ: [CelebA-HQ](https://openxlab.org.cn/datasets/OpenDataLab/CelebA-HQ)
+* FFHQ: [FFHQ](https://drive.google.com/drive/folders/1tZUcXDBeOibC6jcMCtgRRz67pzrAHeHL?usp=drive_link)
+## 📦 Environment Setups
+* torch=2.7.1
+* diffusers=0.38.0
+* transformers=4.56.2
+* flash-linear-attention=0.3.2
+* See 'requirements.txt' for detailed Python libraries required
+```bash
+conda create -n moebius python=3.14.4
+conda activate moebius
+# cd /xx/xx/Moebius
+pip install -r requirements.txt
+```
+## 🗃️ Model Checkpoints
+* Download the checkpoint of [VAE](https://huggingface.co/hustvl/PixelHacker/tree/main/vae) and put it into ./weight/vae.
+* Download the checkpoints of [pretrained version](https://huggingface.co/hustvl/Moebius/tree/main/pretrained), [fine-tuned version (places2)](https://huggingface.co/hustvl/Moebius/tree/main/ft_places2), [fine-tuned version (celeba-hq)](https://huggingface.co/hustvl/Moebius/tree/main/ft_celebahq), [fine-tuned version (ffhq)](https://huggingface.co/hustvl/Moebius/tree/main/ft_ffhq), and put them into ./weight/Moebius.
+* The resulting directory layout should look like this:
+```bash
+├── weight
+|   ├── Moebius
+|   |   ├── pretrained
+|   |   |   ├── diffusion_pytorch_model.bin
+|   |   ├── ft_places2
+|   |   |   ├── diffusion_pytorch_model.bin
+|   |   ├── ft_celebahq
+|   |   |   ├── diffusion_pytorch_model.bin
+|   |   ├── ft_ffhq
+|   |   |   ├── diffusion_pytorch_model.bin
+|   ├── vae
+|   |   ├── config.json
+|   |   ├── diffusion_pytorch_model.bin
+├── ...
+```
+<!-- * teacher model and vae: [hustvl/PixelHacker](https://huggingface.co/hustvl/PixelHacker)
+* student model: [hustvl/Moebius](https://huggingface.co/hustvl/Moebius) -->
+## 🚂 Training
+You can run the following code to start training. The training script supports distributed training, and you can configure the GPU count via environment variables.
+```bash
+# For single GPU training:
+PY_TRAINER=train_distillation.py bash run/run_ddp_1node.sh config/train_demo.sh
+# For multi GPU training:
+NUM_GPUS_PER_MACHINE=4 bash run/run_ddp_1node.sh config/train_demo.sh
+```
+## 🔮 Inference
+You can run the following command directly to get the inpainting results for the example image-mask pairs; the results are written to ./outputs. To run inference on custom data, place each image and its mask under the same file name in ./dataset.local/imgs and ./dataset.local/masks, respectively, and pass those directories as `--real-dir` and `--mask-dir` in the command below.
+```bash
+python -m infer.infer_moebius \
+    --model-config config/model_cfg/moebius.yaml \
+    --model-weight weight/Moebius/ft_celebahq/diffusion_pytorch_model.bin \
+    --real-dir data/images \
+    --mask-dir data/masks \
+    --save-dir ./outputs \
+    --cfg 2.0 \
+    --batch-size 8 \
+    --num-workers 8
+```
+## 🎓 Citation
+```bibtex
+@misc{DuanAndXu2026Moebius,
+      title={Moebius: 0.2B Lightweight Image Inpainting Framework with 10B-Level Performance},
+      author={Kangsheng Duan and Ziyang Xu and Wenyu Liu and Xiaohu Ruan and Xiaoxin Chen and Xinggang Wang},
+      year={2026},
+      eprint={2606.19195},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2606.19195},
+}
+```
+## 🧑‍🤝‍🧑 Acknowledgement
+We sincerely thank the authors of the following open-source repositories for their contributions to the community, which have greatly facilitated our research and development of Moebius: [Sana](https://github.com/NVlabs/Sana), [flash-linear-attention](https://github.com/fla-org/flash-linear-attention), [lambda-networks](https://github.com/lucidrains/lambda-networks), [timm](https://github.com/huggingface/pytorch-image-models), [Muon](https://github.com/KellerJordan/Muon), [diffusers](https://github.com/huggingface/diffusers).

app.py ADDED Viewed

	@@ -0,0 +1,229 @@

+import os
# Runtime/cache locations. They are set right after `import os` and ahead of
# the remaining imports, presumably so the libraries imported afterwards pick
# them up. `setdefault` leaves any value already present in the environment
# untouched.
if os.path.isdir("/data") and os.access("/data", os.W_OK):
    # A writable /data directory exists: keep the Hugging Face cache there.
    _hf_cache = "/data/.cache/huggingface"
else:
    _hf_cache = "/tmp/hf_home"
os.environ.setdefault("HF_HOME", _hf_cache)
os.environ.setdefault("HF_MODULES_CACHE", "/tmp/hf_modules")
os.environ.setdefault("MPLCONFIGDIR", "/tmp/matplotlib")
os.environ.setdefault("GRADIO_SSR_MODE", "false")
+import time
+from pathlib import Path
+from typing import Dict, Tuple
+import spaces
+import gradio as gr
+import torch
+from diffusers import DDIMScheduler
+from diffusers.models import AutoencoderKL
+from huggingface_hub import hf_hub_download, snapshot_download
+from PIL import Image
+from removal.v1_2 import build_removal_model, load_cfg, load_removal_model
+from removal.v1_2.pipeline import RemovalSDXLPipeline_BatchMode
# Directory containing this file; the model config is resolved relative to it.
ROOT = Path(__file__).resolve().parent
CONFIG_PATH = ROOT.joinpath("config", "model_cfg", "moebius.yaml")

# Hugging Face Hub repositories: Moebius checkpoints and the PixelHacker repo
# that provides the `vae/` folder.
MOEBIUS_REPO = "hustvl/Moebius"
PIXELHACKER_REPO = "hustvl/PixelHacker"

# Checkpoint key used for the import-time warm-up and as the fallback when a
# dropdown label is not found in MODEL_CHOICES.
DEFAULT_MODEL_KEY = "ft_places2"

# Dropdown label -> checkpoint sub-folder name. Insertion order is the order
# in which the labels are listed in the UI dropdown.
MODEL_CHOICES = {
    "General scenes (Places2)": "ft_places2",
    "Portraits (CelebA-HQ)": "ft_celebahq",
    "Faces (FFHQ)": "ft_ffhq",
    "Pretrained": "pretrained",
}

# Pipelines built so far, keyed by checkpoint key (see _get_pipeline).
_PIPELINE_CACHE: Dict[str, RemovalSDXLPipeline_BatchMode] = dict()
def _download_vae_dir() -> str:
    """Download the VAE files and return the local directory that holds them.

    Requests only files matching ``vae/*`` from ``PIXELHACKER_REPO``.

    Returns:
        The ``vae`` sub-directory of the downloaded snapshot, as a string.
    """
    snapshot_root = Path(
        snapshot_download(repo_id=PIXELHACKER_REPO, allow_patterns=["vae/*"])
    )
    return str(snapshot_root / "vae")
def _download_model_weight(model_key: str) -> str:
    """Download one checkpoint file from ``MOEBIUS_REPO``.

    Args:
        model_key: Sub-folder of the repository that contains
            ``diffusion_pytorch_model.bin`` (e.g. ``"ft_places2"``).

    Returns:
        Whatever ``hf_hub_download`` returns for that file (annotated ``str``).
    """
    weight_file = f"{model_key}/diffusion_pytorch_model.bin"
    return hf_hub_download(repo_id=MOEBIUS_REPO, filename=weight_file)
def _build_cpu_pipeline(model_key: str) -> RemovalSDXLPipeline_BatchMode:
    """Construct a CPU-resident, float32 inpainting pipeline for one checkpoint.

    Steps, in order: load the YAML model config, point its VAE entry at the
    downloaded VAE directory, build the removal model, load the checkpoint
    weights for ``model_key`` on the CPU (printing whatever the loader
    returns), load the VAE, and create a DDIM scheduler.

    Args:
        model_key: Checkpoint sub-folder name in ``MOEBIUS_REPO``.

    Returns:
        A ``RemovalSDXLPipeline_BatchMode`` created with ``device="cpu"`` and
        ``dtype=torch.float32``.
    """
    cfg = load_cfg(str(CONFIG_PATH))
    cfg["vae"]["model_dir"] = _download_vae_dir()

    # NOTE(review): the meaning of the positional 20 is defined by
    # build_removal_model, which is not visible here -- confirm.
    net = build_removal_model(cfg, 20)
    checkpoint_file = _download_model_weight(model_key)
    load_report = load_removal_model(net, checkpoint_file, device="cpu")
    print(load_report)

    autoencoder = AutoencoderKL.from_pretrained(cfg["vae"]["model_dir"])
    ddim_kwargs = {
        "beta_start": 0.00085,
        "beta_end": 0.012,
        "beta_schedule": "scaled_linear",
        "num_train_timesteps": 1000,
        "clip_sample": False,
    }
    ddim = DDIMScheduler(**ddim_kwargs)

    return RemovalSDXLPipeline_BatchMode(
        removal_model=net,
        vae=autoencoder,
        scheduler=ddim,
        device="cpu",
        dtype=torch.float32,
    )
def _get_pipeline(model_key: str) -> RemovalSDXLPipeline_BatchMode:
    """Return the pipeline for ``model_key``, building and caching it on first use.

    Args:
        model_key: Checkpoint key used both as the cache key and as the
            argument to ``_build_cpu_pipeline``.

    Returns:
        The pipeline object stored in ``_PIPELINE_CACHE`` for this key.
    """
    try:
        return _PIPELINE_CACHE[model_key]
    except KeyError:
        # Not built yet; fall through and construct it outside the handler.
        pass
    built = _build_cpu_pipeline(model_key)
    _PIPELINE_CACHE[model_key] = built
    return built
def _set_pipeline_device(pipe: RemovalSDXLPipeline_BatchMode, device: str) -> None:
    """Move the pipeline's modules to ``device`` and rebuild ``pipe.input_ids``.

    The VAE and the removal model are moved to ``device`` with ``pipe.dtype``
    and put in eval mode. ``pipe.input_ids`` is then recreated on ``device``
    as an int64 tensor with two rows: the upper half of the embedding ids
    ``[n // 2, n)`` followed by the lower half ``[0, n // 2)``, where ``n`` is
    ``pipe.removal_model.num_embeddings``.

    Args:
        pipe: Pipeline whose ``device``, ``vae``, ``removal_model`` and
            ``input_ids`` attributes are updated in place.
        device: Target device string, e.g. ``"cpu"`` or ``"cuda"``.

    Raises:
        RuntimeError: From ``torch.cat`` when ``num_embeddings`` is odd, since
            the two halves then differ in length.
    """
    pipe.device = device
    pipe.vae.to(device=device, dtype=pipe.dtype).eval()
    pipe.removal_model.to(device=device, dtype=pipe.dtype).eval()

    id_num = pipe.removal_model.num_embeddings
    half_id_num = id_num // 2
    # Create the ids directly on the target device instead of going through
    # Python lists and a second .to(device) call.
    all_ids = torch.arange(id_num, dtype=torch.int64, device=device)
    input_ids = all_ids[:half_id_num].unsqueeze(0)
    # NOTE(review): the "un_" prefix suggests these are the unconditional ids
    # for classifier-free guidance -- confirm against the pipeline.
    un_input_ids = all_ids[half_id_num:].unsqueeze(0)
    pipe.input_ids = torch.cat([un_input_ids, input_ids])
def _normalize_inputs(image: Image.Image, mask: Image.Image) -> Tuple[Image.Image, Image.Image]:
    """Validate the uploaded image/mask pair and bring both to a common form.

    The image is converted to RGB. The mask is converted to single-channel
    ``"L"`` mode and resized to the image size with nearest-neighbour
    resampling.

    Args:
        image: Uploaded image, or ``None`` when nothing was uploaded.
        mask: Uploaded mask, or ``None`` when nothing was uploaded.

    Returns:
        ``(rgb_image, gray_mask)`` with identical sizes.

    Raises:
        gr.Error: If either input is missing, if the brightest mask pixel is
            below 8 (mask treated as empty), or if the darkest mask pixel is
            above 247 (mask treated as covering the whole image).
    """
    required = ((image, "Upload an image."), (mask, "Upload a mask."))
    for uploaded, message in required:
        if uploaded is None:
            raise gr.Error(message)

    rgb_image = image.convert("RGB")
    gray_mask = mask.convert("L")
    gray_mask = gray_mask.resize(rgb_image.size, Image.Resampling.NEAREST)

    darkest, brightest = gray_mask.getextrema()
    if brightest < 8:
        raise gr.Error("The mask is empty. Use white pixels for the area to inpaint.")
    if darkest > 247:
        raise gr.Error("The mask covers the whole image. Leave black pixels outside the edit area.")
    return rgb_image, gray_mask
def _model_key(label: str) -> str:
    """Translate a dropdown label into a checkpoint key.

    Args:
        label: A key of ``MODEL_CHOICES``.

    Returns:
        The mapped checkpoint key, or ``DEFAULT_MODEL_KEY`` when ``label`` is
        not a known choice.
    """
    if label in MODEL_CHOICES:
        return MODEL_CHOICES[label]
    return DEFAULT_MODEL_KEY
def _estimate_duration(image, mask, model_name, steps, guidance_scale, paste, compensate, seed, *args, **kwargs):
    """Estimate how many seconds of GPU time one ``run_inpaint`` call needs.

    The signature mirrors ``run_inpaint`` because this function is passed as
    the ``duration`` callable of ``spaces.GPU``; only ``steps`` affects the
    result. The estimate is a 90 s base plus 5 s per step, capped at 240 s.

    Args:
        steps: Requested number of sampling steps; converted with ``int``.
            Negative values are treated as 0 so the estimate never falls
            below the 90 s base.
        image, mask, model_name, guidance_scale, paste, compensate, seed,
        *args, **kwargs: Accepted for signature compatibility and ignored.

    Returns:
        An ``int`` in the range ``[90, 240]``.
    """
    step_count = max(0, int(steps))
    return min(240, 90 + step_count * 5)
# Build the default checkpoint's pipeline at import time so that it is already
# in _PIPELINE_CACHE before any request is handled.
_get_pipeline(DEFAULT_MODEL_KEY)


@spaces.GPU(duration=1)
def _zerogpu_probe():
    """Return the fixed string ``"ready"`` from a ``spaces.GPU``-decorated function.

    NOTE(review): nothing in the visible code calls this; presumably it exists
    so a GPU-decorated function is registered at startup -- confirm.
    """
    probe_status = "ready"
    return probe_status
@spaces.GPU(duration=_estimate_duration)
def run_inpaint(image, mask, model_name, steps, guidance_scale, paste, compensate, seed):
    """Inpaint one image/mask pair on the GPU and report the elapsed time.

    Args:
        image: Uploaded image (validated by ``_normalize_inputs``).
        mask: Uploaded mask (validated by ``_normalize_inputs``).
        model_name: Dropdown label, mapped to a checkpoint via ``_model_key``.
        steps: Number of sampling steps; passed to the pipeline as ``int``.
        guidance_scale: Guidance scale; passed to the pipeline as ``float``.
        paste: Passed to the pipeline as ``bool``.
        compensate: Passed to the pipeline as ``bool``.
        seed: Integer-like value or ``None`` (treated as 0); forwarded to the
            pipeline's ``retry`` argument.

    Returns:
        ``(result, message)``: the first pipeline output and a
        ``"Completed in N.Ns"`` status string.

    Raises:
        gr.Error: Propagated from ``_normalize_inputs`` for invalid inputs.
    """
    image, mask = _normalize_inputs(image, mask)
    model_key = _model_key(model_name)
    seed_value = int(seed) if seed is not None else 0
    pipe = _get_pipeline(model_key)

    # The timer starts before the move to CUDA, so transfer time is included.
    started = time.perf_counter()
    try:
        _set_pipeline_device(pipe, "cuda")
        call_kwargs = {
            "image_size": 512,
            "num_steps": int(steps),
            "guidance_scale": float(guidance_scale),
            "paste": bool(paste),
            "compensate": bool(compensate),
            "noise_offset": 0.0357,
            "retry": seed_value,
            "mute": True,
        }
        with torch.inference_mode():
            outputs = pipe([image], [mask], **call_kwargs)
        elapsed = time.perf_counter() - started
        message = f"Completed in {elapsed:.1f}s"
        return outputs[0], message
    finally:
        # Always return the cached pipeline to the CPU, even when inference
        # failed, then release cached CUDA memory if CUDA is available.
        _set_pipeline_device(pipe, "cpu")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
+# Gradio UI: uploads on the left, result and status text on the right, sampling controls below.
+with gr.Blocks(title="Moebius Inpainting", fill_width=True) as demo:
+    gr.Markdown("# Moebius Inpainting")
+    with gr.Row():
+        # Left column: the picture to edit and its mask, both delivered to the handler as PIL images.
+        with gr.Column(scale=1, min_width=320):
+            input_image = gr.Image(
+                label="Image",
+                type="pil",
+                image_mode="RGB",
+                sources=["upload", "clipboard"],
+                height=360,
+            )
+            input_mask = gr.Image(
+                label="Mask",
+                type="pil",
+                image_mode="L",
+                sources=["upload", "clipboard"],
+                height=360,
+            )
+        # Right column: receives the two values returned by run_inpaint.
+        with gr.Column(scale=1, min_width=320):
+            output_image = gr.Image(label="Result", type="pil", height=520)
+            status = gr.Markdown()
+    with gr.Row():
+        # NOTE(review): the default label below is expected to be a key of MODEL_CHOICES;
+        # _model_key silently falls back to DEFAULT_MODEL_KEY for any label that is not.
+        model_name = gr.Dropdown(
+            label="Checkpoint",
+            choices=list(MODEL_CHOICES.keys()),
+            value="General scenes (Places2)",
+            min_width=240,
+        )
+        steps = gr.Slider(4, 30, value=20, step=1, label="Steps", min_width=180)
+        guidance_scale = gr.Slider(1.0, 6.0, value=2.0, step=0.1, label="CFG", min_width=180)
+        seed = gr.Number(value=0, precision=0, label="Seed", min_width=140)
+    with gr.Row():
+        paste = gr.Checkbox(value=True, label="Paste")
+        compensate = gr.Checkbox(value=False, label="Compensate")
+        run_button = gr.Button("Inpaint", variant="primary")
+    # The inputs list must stay in the same order as run_inpaint's positional parameters.
+    run_button.click(
+        fn=run_inpaint,
+        inputs=[input_image, input_mask, model_name, steps, guidance_scale, paste, compensate, seed],
+        outputs=[output_image, status],
+        api_name="inpaint",
+        concurrency_limit=1,
+    )
+    # Each example row follows the same column order as inputs= and is run through run_inpaint.
+    gr.Examples(
+        examples=[
+            ["data/images/0.png", "data/masks/000000.png", "General scenes (Places2)", 20, 2.0, True, False, 0],
+            ["data/images/1.png", "data/masks/000001.png", "General scenes (Places2)", 20, 2.0, True, False, 1],
+        ],
+        inputs=[input_image, input_mask, model_name, steps, guidance_scale, paste, compensate, seed],
+        outputs=[output_image, status],
+        fn=run_inpaint,
+        cache_examples=True,
+        cache_mode="lazy",
+    )
+# Queue settings: max_size=8 pending requests, default concurrency limit of 1 per event.
+demo.queue(max_size=8, default_concurrency_limit=1)
+if __name__ == "__main__":
+    demo.launch()

assets/logo_dynamic_woWaterMark.gif ADDED Viewed

Git LFS Details

SHA256: 50c044d0b741ab057dd3debc74aca6bcf68752e5dc9f6c238e28ff44cd828dd0
Pointer size: 133 Bytes
Size of remote file: 22.4 MB

assets/pipeline.png ADDED Viewed

Git LFS Details

SHA256: 062dadb63eac08f76e02c53d5f2fda6bed9281e405b890f8fc25890510af0e23
Pointer size: 131 Bytes
Size of remote file: 624 kB

assets/qualitative_comparison.png ADDED Viewed

Git LFS Details

SHA256: c89293e6c515a7625e0c90e0f2587c94acd88336fbc8cf2017bbe414e425b93d
Pointer size: 132 Bytes
Size of remote file: 5.07 MB

assets/sup_showcase_celebahq_ffhq.png ADDED Viewed

Git LFS Details

SHA256: 0c4c7a8775cedafd26308fb9c6d38606f63dc822e89d00aa14fc45bdf8b707c3
Pointer size: 132 Bytes
Size of remote file: 9.75 MB

assets/sup_showcase_places_v2.png ADDED Viewed

Git LFS Details

SHA256: 1672c61e662e87eb8cc37bd6a858959d2a544c3cc74c9cd088f390c88010f2d8
Pointer size: 133 Bytes
Size of remote file: 11.4 MB

assets/tab1.png ADDED Viewed

Git LFS Details

SHA256: 10965aef428ca1f1b942503a09a9c4b1959464c3ccde5dfad7a99d0b5170b8b5
Pointer size: 131 Bytes
Size of remote file: 276 kB

assets/tab1_woTitle.png ADDED Viewed

Git LFS Details

SHA256: 537927361e74307d0f19516757a856935e819544e22bc2b7fb0458e363c375e2
Pointer size: 131 Bytes
Size of remote file: 188 kB

assets/tab2.png ADDED Viewed

Git LFS Details

SHA256: 8f17dfbc00cccd7422546273df24004d33ee11c9e45674bea37770162118f409
Pointer size: 131 Bytes
Size of remote file: 805 kB

assets/tab3.png ADDED Viewed

Git LFS Details

SHA256: 62eac94e1636d9a97b6c7f816267548a9725b3304bd01a69a0c9ba163161f5d5
Pointer size: 131 Bytes
Size of remote file: 793 kB

assets/tab4.png ADDED Viewed

Git LFS Details

SHA256: 2421f7d32c75242a3bf24d0fa1c471fa8f51ecff63afaa2e247fcbe50fb604e8
Pointer size: 131 Bytes
Size of remote file: 511 kB

config/data_demo.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+data:
+    path: data/train_data.jsonl
+    use_rand_mask: True
+    rand_mask_config: config/rand_mask_cfg/random_medium_512.yaml
+    use_extra_fg_mask: False
+    extra_ann_files_4_PureBackTrain_2_RandMask: null

config/model_cfg/moebius.yaml ADDED Viewed

	@@ -0,0 +1,47 @@

+data:
+  image_size: 512
+vae:
+  model_name: 'sdvae_f8d4'
+  model_dir: ./weight/vae
+  downsample_ratio: 8
+  embed_dim: 4
+model:
+  model_type: UNet2DLambdaDWConvMixFFNConditionModel_prune_down_mid_up_block_8x8
+  in_channels: 9
+  out_channels: 4
+  attention_head_dim: 8
+  conv_in_kernel: 3
+  conv_out_kernel: 3
+  cross_attention_dim: 768
+  encoder_hid_dim: 3072
+  encoder_hid_dim_type: 'text_proj'
+  projection_class_embeddings_input_dim: 2560
+  use_lambda_cross_attn: True
+  use_local_self_attn: True
+  down_block_types:
+    - DWMixTFDownBlock2D
+    - DWMixTFDownBlock2D
+    - DWMixTFDownBlock2D
+  mid_block_type: null
+  up_block_types:
+    - DWMixTFUpBlock2D
+    - DWMixTFUpBlock2D
+    - DWMixTFUpBlock2D
+  block_out_channels:
+    - 320
+    - 640
+    - 1280
+  mix_mlp_ratio: 2.5

config/model_cfg/pixelhacker.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+data:
+  image_size: 512
+vae:
+  model_name: 'sdvae_f8d4'
+  model_dir: ./weight/vae
+  downsample_ratio: 8
+  embed_dim: 4
+model:
+  model_type: UNet2DGLAConditionModel
+  in_channels: 9
+  out_channels: 4
+  attention_head_dim: 8
+  cross_attention_dim: 768
+  encoder_hid_dim: 3072
+  encoder_hid_dim_type: 'text_proj'
+  projection_class_embeddings_input_dim: 2560

config/rand_mask_cfg/random_medium_256.yaml ADDED Viewed

	@@ -0,0 +1,33 @@

+generator_kind: random
+mask_generator_kwargs:
+  irregular_proba: 1
+  irregular_kwargs:
+    min_times: 4
+    max_times: 5
+    max_width: 50
+    max_angle: 4
+    max_len: 100
+  box_proba: 0.3
+  box_kwargs:
+    margin: 0
+    bbox_min_size: 10
+    bbox_max_size: 50
+    max_times: 5
+    min_times: 1
+  segm_proba: 0
+  squares_proba: 0
+  # variants_n: 5
+max_masks_per_image: 1
+cropping:
+  out_min_size: 256
+  handle_small_mode: upscale
+  out_square_crop: True
+  crop_min_overlap: 1
+max_tamper_area: 0.5

config/rand_mask_cfg/random_medium_512.yaml ADDED Viewed

	@@ -0,0 +1,32 @@

+generator_kind: random
+mask_generator_kwargs:
+  irregular_proba: 1
+  irregular_kwargs:
+    min_times: 4
+    max_times: 10
+    max_width: 100
+    max_angle: 4
+    max_len: 200
+  box_proba: 0.3
+  box_kwargs:
+    margin: 0
+    bbox_min_size: 30
+    bbox_max_size: 150
+    max_times: 5
+    min_times: 1
+  segm_proba: 0
+  squares_proba: 0
+max_masks_per_image: 1
+cropping:
+  out_min_size: 512
+  handle_small_mode: upscale
+  out_square_crop: True
+  crop_min_overlap: 1
+max_tamper_area: 0.5

config/rand_mask_cfg/random_thick_256.yaml ADDED Viewed

	@@ -0,0 +1,33 @@

+generator_kind: random
+mask_generator_kwargs:
+  irregular_proba: 1
+  irregular_kwargs:
+    min_times: 1
+    max_times: 5
+    max_width: 100
+    max_angle: 4
+    max_len: 200
+  box_proba: 0.3
+  box_kwargs:
+    margin: 10
+    bbox_min_size: 30
+    bbox_max_size: 150
+    max_times: 3
+    min_times: 1
+  segm_proba: 0
+  squares_proba: 0
+  # variants_n: 5
+max_masks_per_image: 1
+cropping:
+  out_min_size: 256
+  handle_small_mode: upscale
+  out_square_crop: True
+  crop_min_overlap: 1
+max_tamper_area: 0.5

config/rand_mask_cfg/random_thick_512.yaml ADDED Viewed

	@@ -0,0 +1,33 @@

+generator_kind: random
+mask_generator_kwargs:
+  irregular_proba: 1
+  irregular_kwargs:
+    min_times: 1
+    max_times: 5
+    max_width: 250
+    max_angle: 4
+    max_len: 450
+  box_proba: 0.3
+  box_kwargs:
+    margin: 10
+    bbox_min_size: 30
+    bbox_max_size: 300
+    max_times: 4
+    min_times: 1
+  segm_proba: 0
+  squares_proba: 0
+  # variants_n: 5
+max_masks_per_image: 1
+cropping:
+  out_min_size: 512
+  handle_small_mode: upscale
+  out_square_crop: True
+  crop_min_overlap: 1
+max_tamper_area: 0.5

config/rand_mask_cfg/random_thin_256.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+generator_kind: random
+mask_generator_kwargs:
+  irregular_proba: 1
+  irregular_kwargs:
+    min_times: 4
+    max_times: 50
+    max_width: 10
+    max_angle: 4
+    max_len: 40
+  box_proba: 0
+  segm_proba: 0
+  squares_proba: 0
+  variants_n: 5
+max_masks_per_image: 1
+cropping:
+  out_min_size: 256
+  handle_small_mode: upscale
+  out_square_crop: True
+  crop_min_overlap: 1
+max_tamper_area: 0.5

config/rand_mask_cfg/random_thin_512.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+generator_kind: random
+mask_generator_kwargs:
+  irregular_proba: 1
+  irregular_kwargs:
+    min_times: 4
+    max_times: 70
+    max_width: 20
+    max_angle: 4
+    max_len: 100
+  box_proba: 0
+  segm_proba: 0
+  squares_proba: 0
+  variants_n: 5
+max_masks_per_image: 1
+cropping:
+  out_min_size: 512
+  handle_small_mode: upscale
+  out_square_crop: True
+  crop_min_overlap: 1
+max_tamper_area: 0.5

config/train_demo.sh ADDED Viewed

	@@ -0,0 +1,45 @@

+# Set WORK_DIR to your project root before running. This script also reads CONFIG_FILE, EXP_NAME and HF_HOME from the environment.
+THIS_SH_PATH=$CONFIG_FILE
+OUTPUT_DIR='exp_outputs'
+OUTPUT_DIR_EXP_NAME="${OUTPUT_DIR}/${EXP_NAME}"
+export OUTPUT_DIR=$OUTPUT_DIR
+export OUTPUT_DIR_EXP_NAME=$OUTPUT_DIR_EXP_NAME
+export HF_HOME=$HF_HOME
+export TRAIN_ARGS=" --data_type RemovalDataset_v1_2 \
+                    --lognorm_t \
+                    --elatentlpips_loss --elatentlpips_loss_weight 0.5 \
+                    --task_loss         --task_loss_weight 0.5 \
+                    --KD_loss_weight 0.01 \
+                    --mse_feat_loss --feat_loss_weight 1.0 --feat_index_T 5 --feat_index_S 2 \
+                    --model_config_path=config/model_cfg/moebius.yaml \
+                    --teacher_weight_path=../../hf_models/hustvl/PixelHacker/pretrained/diffusion_pytorch_model.bin \
+                    --teacher_config_path=config/model_cfg/pixelhacker.yaml \
+                    --data_config=config/data_demo.yaml \
+                    --num_embeddings 20 \
+                    --image_size 512 \
+                    --batch_size 2 \
+                    --num_workers 4 \
+                    --output_dir=${OUTPUT_DIR_EXP_NAME} \
+                    --output_name=exp \
+                    --seed=42 \
+                    --learning_rate=1e-4 \
+                    --global_step=0 \
+                    --max_train_steps=200000 \
+                    --save_every_n_steps=3000 \
+                    --logging_dir=${OUTPUT_DIR_EXP_NAME}/log \
+                    --gradient_accumulation_steps=1 \
+                    --optimizer_type=Muon \
+                    --lr_scheduler=constant_with_warmup \
+                    --lr_warmup_steps=0 \
+                    --save_precision=bf16 \
+                    --mixed_precision=bf16 \
+                    --noise_offset=0.0357 \
+                    --gradient_checkpointing \
+                    --xformers \
+                    --log_with=tensorboard \
+                    --script_args=$THIS_SH_PATH "

data/images/0.png ADDED Viewed

Git LFS Details

SHA256: a6a27b1be3be48d8d89882dbd66927f5eba5be4a49b471a16853b663dde7a3b4
Pointer size: 131 Bytes
Size of remote file: 400 kB

data/images/1.png ADDED Viewed

Git LFS Details

SHA256: 4750545d31413533e45a29736235654ac9b2c0f1dc1956080406861e37dc74e8
Pointer size: 131 Bytes
Size of remote file: 417 kB

data/images/10.png ADDED Viewed

Git LFS Details

SHA256: bbc5088521255df0f23a55ff2e6941cf9e172f4bfd518701c2b2594e81680e20
Pointer size: 131 Bytes
Size of remote file: 317 kB

data/images/100.png ADDED Viewed

Git LFS Details

SHA256: c112d470d70193d2ab1ea0e8746db8d892b8576568dcb257c91e793e836c323f
Pointer size: 131 Bytes
Size of remote file: 377 kB

data/images/10000.png ADDED Viewed

Git LFS Details

SHA256: 120dcf025efb50a6d220ddba07c435a5d12abc3ce1f00b4b4cc868249d1781c0
Pointer size: 131 Bytes
Size of remote file: 362 kB

data/images/10001.png ADDED Viewed

Git LFS Details

SHA256: 89749ea1a5442b738dba155f55d51a225edd6e2a405032554cff1034a81e542e
Pointer size: 131 Bytes
Size of remote file: 318 kB

data/images/10002.png ADDED Viewed

Git LFS Details

SHA256: a4afea8d4e239d74186308ac0dc1994e8c607caae5c17e63f63a62676401a5f9
Pointer size: 131 Bytes
Size of remote file: 278 kB

data/images/10003.png ADDED Viewed

Git LFS Details

SHA256: a133da270d2019d2b6bdf3741293768f0b1cc845395e6a81c31d0b0107539aad
Pointer size: 131 Bytes
Size of remote file: 225 kB

data/masks/000000.png ADDED Viewed

data/masks/000001.png ADDED Viewed

data/masks/000002.png ADDED Viewed

data/masks/000003.png ADDED Viewed

data/masks/000004.png ADDED Viewed

data/masks/000005.png ADDED Viewed

data/masks/000006.png ADDED Viewed

data/masks/000007.png ADDED Viewed

data/train_data.jsonl ADDED Viewed

	@@ -0,0 +1,8 @@

+{"image": "data/images/0.png", "prompt": "background"}
+{"image": "data/images/1.png", "prompt": "background"}
+{"image": "data/images/10.png", "prompt": "background"}
+{"image": "data/images/100.png", "prompt": "background"}
+{"image": "data/images/10000.png", "prompt": "background"}
+{"image": "data/images/10001.png", "prompt": "background"}
+{"image": "data/images/10002.png", "prompt": "background"}
+{"image": "data/images/10003.png", "prompt": "background"}

infer/__init__.py ADDED Viewed

File without changes

infer/infer_moebius.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from functools import partial
+import os
+from typing import List
+from tqdm import tqdm
+import numpy as np
+import torch
+from PIL import Image
+from pathlib import Path
+from .utils import get_batch_infer_args, build_pipeline, SAVER
+from .utils_dataset import SimpleInferDataset, build_dataloader
+def main():
+    args = get_batch_infer_args()
+    dataloader = build_dataloader(args, SimpleInferDataset)
+    pipe = build_pipeline(args)
+    pipe = partial(pipe,
+            guidance_scale=args.cfg,
+            paste=args.pst,
+            compensate=args.cps,
+            num_steps=args.num_step,
+            noise_offset=args.noise_offset
+            )
+    save_root = Path(args.save_dir)
+    save_root.mkdir(parents=True, exist_ok=True)
+    pbar_loader = tqdm(enumerate(dataloader),
+        total=dataloader.dataset.__len__()//args.batch_size+1)
+    for idx, (images, masks, inames) in pbar_loader:
+        image_inpaint_list = pipe(images, masks)
+        names = [iname+'.png' for iname in inames]
+        SAVER.save_images_mp(image_inpaint_list, names, save_root)
+if __name__ == '__main__':
+    main()

infer/utils.py ADDED Viewed

	@@ -0,0 +1,123 @@

+from functools import partial
+import os
+from typing import List
+from pathlib import Path
+import math
+from tqdm import tqdm
+import numpy as np
+import torch
+from PIL import Image
+def get_batch_infer_args(parser=None):
+    if parser is None:
+        import argparse
+        parser = argparse.ArgumentParser()
+    def str2bool(v):
+        if isinstance(v, bool):
+            return v
+        if v.lower() in ('yes', 'true', 't', 'y', '1'):
+            return True
+        elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+            return False
+        else:
+            raise argparse.ArgumentTypeError('Boolean value expected.')
+    # model argument
+    parser.add_argument("--model-config", type=str, required=False, default=None)
+    parser.add_argument("--model-weight", type=str, required=False, default=None)
+    # sampling argument
+    parser.add_argument("--num-step", type=int, required=False, default=20)
+    parser.add_argument("--cfg", type=float, required=False, default=2.5)
+    parser.add_argument("--pst", type=str2bool, required=False, default=True)
+    parser.add_argument("--cps", type=str2bool, required=False, default=False)
+    parser.add_argument("--noise-offset", type=float, required=False, default=0.0357)
+    parser.add_argument("--seed", type=int, default=0, required=False)
+    # data argument
+    parser.add_argument("--real-dir", type=Path, required=True)
+    parser.add_argument("--mask-dir", type=Path, required=False)
+    parser.add_argument("--resolution", type=int, default=512, required=False)
+    # runtime argument
+    parser.add_argument("--device", type=str, required=False, default="cuda")
+    parser.add_argument("--batch-size", type=int, required=False, default=32)
+    parser.add_argument("--num-workers", type=int, required=False, default=64)
+    # save argument
+    parser.add_argument("--save-dir", type=str, required=True)
+    parser.add_argument("--visualize-latent", action="store_true", default=False)
+    return parser.parse_args()
+def build_pipeline(args):
+    from diffusers import DDIMScheduler
+    from removal.v1_2.pipeline import RemovalSDXLPipeline_BatchMode as Removal_Pipeline
+    from removal.v1_2 import build_removal_model, load_cfg, load_removal_model
+    from utils_train import build_vae
+    model_cfg = load_cfg(args.model_config)
+    removal_model = build_removal_model(model_cfg, 20).to(args.device)
+    print(load_removal_model(removal_model, args.model_weight,args.device))
+    vae = build_vae(model_cfg).to(args.device)
+    scheduler = DDIMScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
+        num_train_timesteps=1000, clip_sample=False)
+    pipe = Removal_Pipeline(
+        removal_model=removal_model,
+        vae=vae,
+        scheduler=scheduler,
+        device=args.device,
+        dtype=torch.float)
+    return pipe
+class SAVER:
+    @staticmethod
+    def save_image(img, name, path):
+        img.save(path / name)
+        return name
+    @staticmethod
+    def save_images(images:List[Image.Image], names:List[str], save_root:str):
+        assert len(images) == len(names), \
+            f"images and names are not equal: {len(images)}!={len(names)}"
+        pbar_save = tqdm(zip(images, names), total=len(names))
+        cache_names = os.listdir(save_root)
+        for image, name in pbar_save:
+            if name not in cache_names:
+                SAVER.save_image(image, name, save_root)
+    @staticmethod
+    def save_images_mt(images:List[Image.Image], names:List[str], save_root:str, num_workers=8):
+        from concurrent.futures import ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=num_workers) as executor:
+            futures = [
+                executor.submit(SAVER.save_image, image, name, save_root) for image, name in zip(images, names)]
+            for future in tqdm(futures):
+                future.result()
+    @staticmethod
+    def save_images_mp(images:List[Image.Image], names:List[str], save_root:str, num_workers=8):
+        from concurrent.futures import ProcessPoolExecutor
+        with ProcessPoolExecutor(max_workers=num_workers) as executor:
+            futures = [
+                executor.submit(SAVER.save_image, image, name, save_root) for image, name in zip(images, names)]
+            for future in tqdm(futures):
+                future.result()

infer/utils_dataset.py ADDED Viewed

	@@ -0,0 +1,211 @@

+# ---------------- Dataset Utils -----------------------
+import warnings
+from pathlib import Path
+from typing import Tuple, Optional
+import math
+import os
+import numpy as np
+import torch
+from PIL import Image, ImageDraw
+from torch.utils.data import Dataset, DataLoader
+warnings.filterwarnings("ignore")
+def RandomBrush(
+    max_tries,
+    s,
+    min_num_vertex=4,
+    max_num_vertex=18,
+    mean_angle=2*math.pi / 5,
+    angle_range=2*math.pi / 15,
+    min_width=12,
+    max_width=48
+):
+    H, W = s, s
+    average_radius = math.sqrt(H*H+W*W) / 8
+    mask = Image.new('L', (W, H), 0)
+    for _ in range(np.random.randint(max_tries)):
+        num_vertex = np.random.randint(min_num_vertex, max_num_vertex)
+        angle_min = mean_angle - np.random.uniform(0, angle_range)
+        angle_max = mean_angle + np.random.uniform(0, angle_range)
+        angles = []
+        vertex = []
+        for i in range(num_vertex):
+            if i % 2 == 0:
+                angles.append(2*math.pi - np.random.uniform(angle_min, angle_max))
+            else:
+                angles.append(np.random.uniform(angle_min, angle_max))
+        h, w = mask.size
+        vertex.append((int(np.random.randint(0, w)), int(np.random.randint(0, h))))
+        for i in range(num_vertex):
+            r = np.clip(
+                np.random.normal(loc=average_radius, scale=average_radius//2),
+                0, 2*average_radius)
+            new_x = np.clip(vertex[-1][0] + r * math.cos(angles[i]), 0, w)
+            new_y = np.clip(vertex[-1][1] + r * math.sin(angles[i]), 0, h)
+            vertex.append((int(new_x), int(new_y)))
+        draw = ImageDraw.Draw(mask)
+        width = int(np.random.uniform(min_width, max_width))
+        draw.line(vertex, fill=1, width=width)
+        for v in vertex:
+            draw.ellipse((v[0] - width//2,
+                          v[1] - width//2,
+                          v[0] + width//2,
+                          v[1] + width//2),
+                         fill=1)
+        if np.random.random() > 0.5:
+            mask.transpose(Image.FLIP_LEFT_RIGHT)
+        if np.random.random() > 0.5:
+            mask.transpose(Image.FLIP_TOP_BOTTOM)
+    mask = np.asarray(mask, np.uint8)
+    if np.random.random() > 0.5:
+        mask = np.flip(mask, 0)
+    if np.random.random() > 0.5:
+        mask = np.flip(mask, 1)
+    return mask
+def RandomMask(s, hole_range=[0,1]):
+    coef = min(hole_range[0] + hole_range[1], 1.0)
+    while True:
+        mask = np.ones((s, s), np.uint8)
+        def Fill(max_size):
+            w, h = np.random.randint(max_size), np.random.randint(max_size)
+            ww, hh = w // 2, h // 2
+            x, y = np.random.randint(-ww, s - w + ww), np.random.randint(-hh, s - h + hh)
+            mask[max(y, 0): min(y + h, s), max(x, 0): min(x + w, s)] = 0
+        def MultiFill(max_tries, max_size):
+            for _ in range(np.random.randint(max_tries)):
+                Fill(max_size)
+        MultiFill(int(10 * coef), s // 2)
+        MultiFill(int(5 * coef), s)
+        mask = np.logical_and(mask, 1 - RandomBrush(int(20 * coef), s))
+        hole_ratio = 1 - np.mean(mask)
+        if hole_range is not None and (hole_ratio <= hole_range[0] or hole_ratio >= hole_range[1]):
+            continue
+        return (mask * 255).astype(np.uint8)
+class InferDataset(Dataset): # ABC
+    img_ext = {".jpg", ".jpeg", ".JPG", ".JPEG", ".png", ".PNG"}
+    def __init__(
+        self,
+        real_dir: Path,
+        mask_dir: Optional[Path] = None,
+        resolution: int = None
+    ):
+        super(InferDataset, self).__init__()
+        self.img_paths = sorted([i for i in Path(real_dir).iterdir() if i.suffix in self.img_ext])
+        self.mask_dir = mask_dir
+        self.resolution = resolution
+    def __len__(self):
+        return len(self.img_paths)
+    def __getitem__(self, index) -> Tuple[torch.Tensor, np.array, np.array, str]:
+        img_path = Path(self.img_paths[index])
+        img_name = img_path.stem
+        img = Image.open(img_path).convert("RGB")
+        if img.size[0] != self.resolution or img.size[1] != self.resolution:
+            img = img.resize((self.resolution, self.resolution), Image.BICUBIC)
+        assert img.size[0] == self.resolution
+        if self.mask_dir is not None:
+            # mask_path = self.mask_dir / f"{img_name}.png"
+            mask_path = self.mask_dir / f"img000{img_name}.png"
+            mask = Image.open(mask_path).convert("L")
+            mask = mask.resize((self.resolution, self.resolution), Image.NEAREST)
+            assert mask.size[0] == self.resolution
+        else:
+            mask = RandomMask(img.size[0])
+            mask = Image.fromarray(mask).convert("L")
+        img = np.array(img)
+        mask = np.array(mask)[:, :, np.newaxis] // 255
+        img = torch.Tensor(img).float() * 2 / 255 - 1
+        mask = torch.Tensor(mask).float()
+        img = img.permute(2, 0, 1)
+        mask = mask.permute(2, 0, 1)
+        x = torch.cat([mask - 0.5, img * mask], dim=0)
+        return x, np.array(img), mask, img_name
+class SimpleInferDataset(torch.utils.data.Dataset):
+    def __init__(
+        self,
+        real_dir: Path,
+        mask_dir: Path = None,
+        resolution: int = 512
+    ):
+        super(SimpleInferDataset, self).__init__()
+        img_extensions = {".jpg", ".jpeg", ".JPG", ".JPEG", ".png", ".PNG"}
+        self.img_paths  = sorted([i for i in Path(real_dir).iterdir() if i.suffix in img_extensions])
+        self.img_dir = real_dir
+        if mask_dir:
+            self.mask_paths = sorted([i for i in Path(mask_dir).iterdir() if i.suffix in img_extensions])
+        self.mask_dir = mask_dir
+        self.resolution = resolution
+    def __getitem__(self, index):
+        img_path = Path(self.img_paths[index])
+        img_name = os.path.basename(img_path)
+        img = Image.open(img_path).convert("RGB")
+        if self.mask_dir:
+            mask_path = Path(self.mask_paths[index])
+            mask = Image.open(mask_path).convert("L")
+        else:
+            mask = RandomMask(img.size[0])
+            mask = Image.fromarray(mask).convert("L")
+        mask = mask.resize((self.resolution, self.resolution), Image.NEAREST)
+        if img.size[0] != self.resolution or img.size[1] != self.resolution:
+            img = img.resize((self.resolution, self.resolution), Image.BICUBIC)
+        return img, mask, img_name
+    def __len__(self):
+        return len(self.img_paths)
+def collate_fn(inputs):
+    image_list = [i[0] for i in inputs]
+    mask_list = [i[1] for i in inputs]
+    iname_list = [i[2] for i in inputs]
+    return image_list, mask_list, iname_list
+def build_dataloader(args, dataset_class=InferDataset):
+    dataset = dataset_class(
+        real_dir=args.real_dir,
+        mask_dir=args.mask_dir,
+        resolution=args.resolution)
+    dataloader = DataLoader(
+        dataset,
+        shuffle=False,
+        batch_size=args.batch_size,
+        num_workers=args.num_workers,
+        drop_last=False,
+        collate_fn = collate_fn,
+        pin_memory=True,
+        # persistent_workers=True
+    )
+    return dataloader

library/__init__.py ADDED Viewed

File without changes

library/chinese_sdxl_train_util.py ADDED Viewed

	@@ -0,0 +1,350 @@

+import os
+import sys
+import gc
+import re
+import json
+import math
+import time
+import toml
+import shutil
+import argparse
+from typing import Optional
+from tqdm import tqdm
+from PIL import Image
+import importlib
+import torch
+from torch.utils.tensorboard import SummaryWriter
+from library import train_util
+from transformers import BertTokenizer, BertTokenizerFast, ChineseCLIPTextModel, PreTrainedTokenizerFast, T5Tokenizer, T5ForConditionalGeneration
+from diffusers import (
+    DDPMScheduler,
+    EulerAncestralDiscreteScheduler,
+    DPMSolverMultistepScheduler,
+    DDIMScheduler,
+    EulerDiscreteScheduler,
+    KDPM2DiscreteScheduler,
+    AutoencoderKL,
+    UNet2DConditionModel,
+)
+from diffusers.models import UNet2DConditionModel, Transformer2DModel
+from torch.utils.tensorboard import SummaryWriter
+from tqdm import tqdm
+from transformers import BertTokenizerFast, ChineseCLIPTextModel
+from library import train_util
+# from mmmp_text import DebertaV2Model
+# from transformers.models.qwen2.modeling_qwen2 import Qwen2Model
+# from diffusers_patch.models.vivo_llm2vec import LLM2VecWithoutPool
+# from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
+DEFAULT_NOISE_OFFSET = 0.0357
+def load_target_model(args, accelerator, pipe_class, weight_dtype):
+    # load models for each process
+    for pi in range(accelerator.state.num_processes):
+        if pi == accelerator.state.local_process_index:
+            print(f"loading model for process {accelerator.process_index}/{accelerator.state.num_processes}")
+            (
+                text_encoder1,
+                text_encoder2,
+                vae,
+                unet,
+            ) = _load_target_model(
+                args,
+                args.pretrained_model_name_or_path,
+                args.vae,
+                pipe_class,
+                weight_dtype,
+                accelerator.device if args.lowram else "cpu",
+            )
+            gc.collect()
+            torch.cuda.empty_cache()
+    accelerator.wait_for_everyone()
+    return text_encoder1, text_encoder2, vae, unet
+def _load_target_model(args, name_or_path: str, vae_path: Optional[str], pipe_class, weight_dtype, device="cpu"):
+    name_or_path = os.readlink(name_or_path) if os.path.islink(name_or_path) else name_or_path
+    model_index_path = os.path.join(name_or_path, 'model_index.json')
+    model_index = read_json(model_index_path)
+    TextEncoderLib1 = model_index['text_encoder'][0]
+    TextEncoderLib2 = model_index['text_encoder_2'][0]
+    TextEncoderClass1 = model_index['text_encoder'][-1]
+    TextEncoderClass2 = model_index['text_encoder_2'][-1]
+    library1 = importlib.import_module(TextEncoderLib1)
+    library2 = importlib.import_module(TextEncoderLib2)
+    TextEncoderClass1 = getattr(library1, TextEncoderClass1)
+    TextEncoderClass2 = getattr(library2, TextEncoderClass2)
+    if 'unet' in model_index:
+        UNetClass = eval(model_index['unet'][-1])
+        unet_dir = 'unet'
+    elif 'transformer' in model_index:
+        UNetClass = eval(model_index['transformer'][-1])
+        unet_dir = 'transformer'
+    print(f"TextEncoderClass1:{TextEncoderClass1}")
+    print(f"TextEncoderClass2:{TextEncoderClass2}")
+    print(f"UNetClass:{UNetClass}")
+    vae = AutoencoderKL.from_pretrained(os.path.join(name_or_path, 'vae'), torch_dtype=weight_dtype, low_cpu_mem_usage=False, device_map=None)
+    unet = UNetClass.from_pretrained(os.path.join(name_or_path, unet_dir), torch_dtype=weight_dtype, low_cpu_mem_usage=False, device_map=None, ignore_mismatched_sizes=True)
+    text_encoder1 = TextEncoderClass1.from_pretrained(os.path.join(name_or_path, 'text_encoder'), torch_dtype=weight_dtype)
+    text_encoder2 = TextEncoderClass2.from_pretrained(os.path.join(name_or_path, 'text_encoder_2'), torch_dtype=weight_dtype)
+    vae_version = vae.config.version if 'version' in vae.config else ''
+    if vae_version == 'vivo':
+        vae.quant_conv = torch.nn.Identity()
+        vae.post_quant_conv = torch.nn.Identity()
+    return text_encoder1, text_encoder2, vae, unet
def load_tokenizers(args: argparse.Namespace):
    """Build the two tokenizers listed in the pipeline's ``model_index.json``.

    The ``tokenizer`` entry supplies both the library (first element) and the
    class name (last element) of the first tokenizer. For the second tokenizer
    only the library is read from the ``tokenizer_2`` entry; its class name is
    hard-coded to ``BertTokenizer``.

    Args:
        args: Parsed arguments. ``pretrained_model_name_or_path`` is the model
            directory. ``max_token_length``, when present and not ``None``,
            is only printed; it is not applied to the tokenizers here.

    Returns:
        A two-element list ``[tokenizer_1, tokenizer_2]``.
    """
    print("prepare tokenizers")
    model_root = args.pretrained_model_name_or_path
    index = read_json(os.path.join(model_root, 'model_index.json'))

    first_lib_name = index['tokenizer'][0]
    second_lib_name = index['tokenizer_2'][0]
    first_cls_name = index['tokenizer'][-1]
    second_cls_name = "BertTokenizer"  # ToDo: model_index['tokenizer_2'][-1]

    # Resolve "library name + class name" pairs into actual classes.
    first_lib = importlib.import_module(first_lib_name)
    second_lib = importlib.import_module(second_lib_name)
    first_cls = getattr(first_lib, first_cls_name)
    second_cls = getattr(second_lib, second_cls_name)

    loaded = [
        first_cls.from_pretrained(model_root, subfolder='tokenizer'),
        second_cls.from_pretrained(model_root, subfolder='tokenizer_2'),
    ]

    requested_length = getattr(args, "max_token_length", None)
    if requested_length is not None:
        print(f"update token length: {requested_length}")
    return loaded
def get_hidden_states_sdxl(
    input_ids1: torch.Tensor,
    input_ids2: torch.Tensor,
    tokenizer1: BertTokenizerFast,
    tokenizer2: BertTokenizerFast,
    text_encoder1: ChineseCLIPTextModel,
    text_encoder2: ChineseCLIPTextModel,
    weight_dtype: Optional[str] = None,
    attention_mask1: Optional[torch.Tensor] = None,
    attention_mask2: Optional[torch.Tensor] = None,
):
    """Encode both token streams and return their hidden states.

    Token ids (and attention masks, when given) are flattened to rows of
    ``tokenizer.model_max_length`` before encoding, and the resulting hidden
    states are reshaped back so that their first dimension is the original
    batch size.

    Args:
        input_ids1, input_ids2: Token ids for the first / second encoder.
        tokenizer1, tokenizer2: Only ``model_max_length`` is read from them.
        text_encoder1, text_encoder2: Encoders handed to ``encode_token``.
        weight_dtype: When not ``None``, both hidden states are converted with
            ``Tensor.to(weight_dtype)``. The pooled output is left untouched.
        attention_mask1, attention_mask2: Optional masks. Each one is handled
            on its own, so supplying only one of them is valid.

    Returns:
        ``(hidden_states1, hidden_states2, pool2)`` where ``pool2`` is the
        pooled output of the second encoder.
    """
    # input_ids: b,n,77 -> b*n, 77
    b_size = input_ids1.size()[0]
    input_ids1 = input_ids1.reshape((-1, tokenizer1.model_max_length))  # batch_size*n, 77
    input_ids2 = input_ids2.reshape((-1, tokenizer2.model_max_length))  # batch_size*n, 77

    # Reshape each mask independently. Previously the second mask was only
    # reshaped when the first one was present, which crashed on
    # (mask1, None) and skipped the reshape on (None, mask2).
    if attention_mask1 is not None:
        attention_mask1 = attention_mask1.reshape((-1, tokenizer1.model_max_length))
    if attention_mask2 is not None:
        attention_mask2 = attention_mask2.reshape((-1, tokenizer2.model_max_length))

    hidden_states1, _ = encode_token(input_ids1, attention_mask1, text_encoder1)
    hidden_states2, pool2 = encode_token(input_ids2, attention_mask2, text_encoder2)

    hidden_states1 = hidden_states1.reshape((b_size, -1, hidden_states1.shape[-1]))
    hidden_states2 = hidden_states2.reshape((b_size, -1, hidden_states2.shape[-1]))

    if weight_dtype is not None:
        # this is required for additional network training
        hidden_states1 = hidden_states1.to(weight_dtype)
        hidden_states2 = hidden_states2.to(weight_dtype)
    return hidden_states1, hidden_states2, pool2
def encode_token(input_ids, attention_mask, text_encoder):
    """Run ``text_encoder`` and return ``(prompt_embeds, pooled_prompt_embeds)``.

    Three encoder families are handled:

    * ``T5ForConditionalGeneration``: only ``text_encoder.encoder`` is run;
      the last hidden state is returned and the pooled value is ``None``.
    * ``ChineseCLIPTextModel``: the penultimate hidden state is returned
      together with the model's ``pooler_output``.
    * anything else: the last hidden state is returned and the pooled value
      is its mean over dimension 1 (the attention mask is not used for that
      mean).
    """
    if isinstance(text_encoder, T5ForConditionalGeneration):
        # T5
        encoder_out = text_encoder.encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        return encoder_out.hidden_states[-1], None

    outputs = text_encoder(
        input_ids,
        attention_mask=attention_mask,
        output_hidden_states=True,
    )

    if isinstance(text_encoder, ChineseCLIPTextModel):
        # clip Bert: pooled output comes from the model, sequence output
        # from the second-to-last layer.
        pooled = outputs['pooler_output']
        return outputs.hidden_states[-2], pooled

    # Other encoders (the original note lists 3mp_Bert, Qwen2Model,
    # LLM2VecWithoutPool, GLMModel): the output field name differs per model.
    if 'last_hidden_states' in outputs:
        sequence = outputs.last_hidden_states
    else:
        sequence = outputs.last_hidden_state
    return sequence, sequence.mean(dim=1)
def prepare_logging(args: argparse.Namespace, is_main_process):
    """Create the timestamped log directory and a TensorBoard writer.

    Args:
        args: Parsed arguments. Reads ``logging_dir``, ``log_prefix``,
            ``log_with``, ``script_args`` and ``dataset_config``.
        is_main_process: Only the main process creates directories, copies
            the launch script / dataset config and opens a writer.

    Returns:
        A ``SummaryWriter`` on the main process when ``logging_dir`` is set,
        otherwise ``None``.

    Raises:
        ValueError: ``log_with`` is ``"tensorboard"`` or ``"all"`` but no
            ``logging_dir`` was given.
    """
    if args.logging_dir is None:
        if args.log_with in ["tensorboard", "all"]:
            raise ValueError("logging_dir is required when log_with is tensorboard / Tensorboardを使う場合、logging_dirを指定してください")
        # No directory to log into. The previous code went on to call
        # os.path.join(None, ...) here and died with a TypeError.
        return None

    log_prefix = "" if args.log_prefix is None else args.log_prefix
    logging_dir = args.logging_dir + "/" + log_prefix + time.strftime("%Y%m%d%H%M%S", time.localtime())
    tensorboard_dir = os.path.join(logging_dir, 'tensorboard')

    if not is_main_process:
        return None

    os.makedirs(logging_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    if args.script_args:
        # Keep a copy of the launch script (and dataset config, when one is
        # configured) next to the logs.
        shutil.copyfile(args.script_args, os.path.join(logging_dir, os.path.basename(args.script_args)))
        if args.dataset_config:
            shutil.copyfile(args.dataset_config, os.path.join(logging_dir, os.path.basename(args.dataset_config)))
    return SummaryWriter(tensorboard_dir)
def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCaching: bool = True):
    """Check SDXL training options and fill in dependent defaults on ``args``.

    * Prints a notice when ``clip_skip`` is set.
    * Unless ``multires_noise_iterations`` is truthy, defaults
      ``args.noise_offset`` to ``DEFAULT_NOISE_OFFSET`` (warning when a
      different value was given) and prints the effective value.
    * Asserts that ``weighted_captions`` is not enabled.
    * When ``supportTextEncoderCaching`` is true, turns on
      ``cache_text_encoder_outputs`` if only the ``..._to_disk`` variant was
      requested.
    """
    if args.clip_skip is not None:
        print("clip_skip will be unexpected / SDXL学習ではclip_skipは動作しません")

    if not args.multires_noise_iterations:
        if args.noise_offset is None:
            args.noise_offset = DEFAULT_NOISE_OFFSET
        elif args.noise_offset != DEFAULT_NOISE_OFFSET:
            print(
                f"Warning: SDXL has been trained with noise_offset={DEFAULT_NOISE_OFFSET} / SDXLはnoise_offset={DEFAULT_NOISE_OFFSET}で学習されています"
            )
        print(f"noise_offset is set to {args.noise_offset}")
    else:
        print(
            f"Warning: SDXL has been trained with noise_offset={DEFAULT_NOISE_OFFSET}, but noise_offset is disabled due to multires_noise_iterations"
        )

    # A missing attribute counts as "not enabled".
    weighted_enabled = getattr(args, "weighted_captions", False)
    assert (
        not weighted_enabled
    ), "weighted_captions cannot be enabled in SDXL training currently / SDXL学習では今のところweighted_captionsを有効にすることはできません"

    if not supportTextEncoderCaching:
        return
    if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs:
        # Caching to disk implies caching at all.
        args.cache_text_encoder_outputs = True
        print(
            "cache_text_encoder_outputs is enabled because cache_text_encoder_outputs_to_disk is enabled / "
            "cache_text_encoder_outputs_to_diskが有効になっているためcache_text_encoder_outputsが有効になりました"
        )
def timestep_embedding(timesteps, dim, max_period=10000):
    """Build sinusoidal embeddings for a 1-D tensor of (possibly fractional) indices.

    Args:
        timesteps: 1-D tensor with one index per batch element.
        dim: Width of each embedding row.
        max_period: Sets the lowest frequency used.

    Returns:
        A tensor of shape ``[len(timesteps), dim]``: cosine terms first, then
        sine terms, plus a trailing zero column when ``dim`` is odd.
    """
    n_freqs = dim // 2
    positions = torch.arange(start=0, end=n_freqs, dtype=torch.float32)
    freqs = torch.exp(-math.log(max_period) * positions / n_freqs)
    freqs = freqs.to(device=timesteps.device)

    angles = timesteps[:, None].float() * freqs[None]
    columns = [torch.cos(angles), torch.sin(angles)]
    if dim % 2:
        # Odd width: pad with a single zero column.
        columns.append(torch.zeros_like(columns[0][:, :1]))
    return torch.cat(columns, dim=-1)
def get_timestep_embedding(x, outdim):
    """Embed every entry of a 2-D tensor and concatenate the results per row.

    ``x`` must be two-dimensional. Each scalar is embedded to ``outdim``
    values with ``timestep_embedding`` and the output has shape
    ``(x.shape[0], x.shape[1] * outdim)``.
    """
    assert len(x.shape) == 2
    n_rows, n_cols = x.shape
    flat_embedding = timestep_embedding(x.flatten(), outdim)
    return flat_embedding.reshape((n_rows, n_cols * outdim))
def get_size_embeddings(orig_size, crop_size, target_size, device):
    """Concatenate 256-wide embeddings of the three size tensors along dim 1.

    The order is original size, crop size, target size. The result is moved
    to ``device``.
    """
    parts = [get_timestep_embedding(size, 256) for size in (orig_size, crop_size, target_size)]
    return torch.cat(parts, dim=1).to(device)
def add_sdxl_training_arguments(parser: argparse.ArgumentParser):
    """Register the two text-encoder caching switches on ``parser``.

    Both options are ``store_true`` flags and therefore default to ``False``.
    """
    switches = (
        ("--cache_text_encoder_outputs", "cache text encoder outputs / text encoderの出力をキャッシュする"),
        (
            "--cache_text_encoder_outputs_to_disk",
            "cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする",
        ),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)
def set_unet_eff_attn(unet, mem_eff_attn, xformers, sdpa):
    """Enable at most one attention implementation on ``unet``.

    The flags are checked in priority order ``mem_eff_attn``, ``xformers``,
    ``sdpa``; the first truthy one wins and the rest are ignored. Nothing is
    changed when all three are falsy.

    Raises:
        ImportError: ``xformers`` was requested but ``xformers.ops`` cannot
            be imported.
    """
    if mem_eff_attn:
        print("Enable memory efficient attention for U-Net")
        unet.set_use_memory_efficient_attention_xformers(False, True)
        return

    if xformers:
        print("Enable xformers for U-Net")
        # Fail early with a readable message when the package is missing.
        try:
            import xformers.ops
        except ImportError:
            raise ImportError("No xformers / xformersがインストールされていないようです")
        unet.set_use_memory_efficient_attention_xformers(True, False)
        return

    if sdpa:
        print("Enable SDPA for U-Net")
        unet.set_use_sdpa(True)
def set_diffusers_xformers_flag(model, valid):
    """Call ``set_use_memory_efficient_attention_xformers(valid)`` across a module tree.

    ``model`` and all of its descendants are visited parent-first, children
    in declaration order; modules without that method are just traversed.
    """
    pending = [model]
    while pending:
        module = pending.pop()
        if hasattr(module, "set_use_memory_efficient_attention_xformers"):
            module.set_use_memory_efficient_attention_xformers(valid)
        # Push children reversed so they are popped in their original order.
        pending.extend(reversed(list(module.children())))
def read_json(json_path):
    """Parse the JSON file at ``json_path`` and return the decoded object.

    The file is opened as UTF-8 and closed before returning (the previous
    version left the handle open for the garbage collector).
    """
    with open(json_path, encoding="utf-8") as handle:
        return json.load(handle)

library/custom_train_functions.py ADDED Viewed

	@@ -0,0 +1,515 @@

+import torch
+import argparse
+import random
+import re
+from typing import List, Optional, Union
def prepare_scheduler_for_custom_training(noise_scheduler, device):
    """Attach a per-timestep SNR table to ``noise_scheduler`` as ``all_snr``.

    The table is ``(sqrt(alphas_cumprod) / sqrt(1 - alphas_cumprod)) ** 2``
    moved to ``device``. A scheduler that already has ``all_snr`` is left
    untouched.
    """
    if not hasattr(noise_scheduler, "all_snr"):
        cumulative = noise_scheduler.alphas_cumprod
        signal_scale = torch.sqrt(cumulative)
        noise_scale = torch.sqrt(1.0 - cumulative)
        noise_scheduler.all_snr = ((signal_scale / noise_scale) ** 2).to(device)
def fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler):
    """Rescale the scheduler's betas so the final timestep has zero SNR.

    Follows the procedure referenced in the printed URL: sqrt(alphas_cumprod)
    is shifted so its last entry becomes zero and rescaled so its first entry
    keeps its old value. ``betas``, ``alphas`` and ``alphas_cumprod`` on
    ``noise_scheduler`` are then overwritten with the derived values.
    """
    print("fix noise scheduler betas: https://arxiv.org/abs/2305.08891")

    def rescaled_betas(original_betas):
        # betas -> sqrt of the cumulative alpha product
        sqrt_bar = (1 - original_betas).cumprod(0).sqrt()
        first = sqrt_bar[0].clone()
        last = sqrt_bar[-1].clone()
        # Shift the last entry to zero, then scale the first back to its old value.
        sqrt_bar = (sqrt_bar - last) * (first / (first - last))
        # Back from cumulative products to per-step alphas, then betas.
        bar = sqrt_bar**2
        step_alphas = torch.cat([bar[0:1], bar[1:] / bar[:-1]])
        return 1 - step_alphas

    new_betas = rescaled_betas(noise_scheduler.betas)
    new_alphas = 1.0 - new_betas

    noise_scheduler.betas = new_betas
    noise_scheduler.alphas = new_alphas
    noise_scheduler.alphas_cumprod = torch.cumprod(new_alphas, dim=0)
def apply_snr_weight(loss, timesteps, noise_scheduler, gamma):
    """Scale ``loss`` by ``min(gamma / SNR(t), 1)`` for each sampled timestep.

    SNR values are looked up in ``noise_scheduler.all_snr``; the weight is
    cast to float and moved to ``loss``'s device before multiplying.
    """
    per_step_snr = torch.stack([noise_scheduler.all_snr[t] for t in timesteps])
    ratio = torch.div(torch.ones_like(per_step_snr) * gamma, per_step_snr)
    capped = torch.minimum(ratio, torch.ones_like(ratio))  # from paper
    return loss * capped.float().to(loss.device)
def scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler):
    """Multiply ``loss`` by the per-timestep factor from ``get_snr_scale``."""
    return loss * get_snr_scale(timesteps, noise_scheduler)
def get_snr_scale(timesteps, noise_scheduler):
    """Return ``snr / (snr + 1)`` for each timestep, with SNR capped at 1000.

    SNR values come from ``noise_scheduler.all_snr``. The cap keeps the
    result finite where the table holds ``inf`` (timestep 0).
    """
    looked_up = [noise_scheduler.all_snr[t] for t in timesteps]
    snr = torch.stack(looked_up)  # one value per batch element
    snr = torch.minimum(snr, torch.full_like(snr, 1000))
    return snr / (snr + 1)
def add_v_prediction_like_loss(loss, timesteps, noise_scheduler, v_pred_like_loss):
    """Add ``loss / scale * v_pred_like_loss`` to ``loss``.

    ``scale`` is the per-timestep factor returned by ``get_snr_scale``.
    """
    snr_scale = get_snr_scale(timesteps, noise_scheduler)
    extra_term = loss / snr_scale * v_pred_like_loss
    return loss + extra_term
# TODO: these options are split between this module and train_util; consolidate them in one place.
def add_custom_train_arguments(parser: argparse.ArgumentParser, support_weighted_captions: bool = True):
    """Register the custom loss-weighting options on ``parser``.

    Adds ``--min_snr_gamma``, ``--scale_v_pred_loss_like_noise_pred`` and
    ``--v_pred_like_loss``; ``--weighted_captions`` is added only when
    ``support_weighted_captions`` is true.
    """
    option_specs = [
        (
            "--min_snr_gamma",
            dict(
                type=float,
                default=None,
                help="gamma for reducing the weight of high loss timesteps. Lower numbers have stronger effect. 5 is recommended by paper. / 低いタイムステップでの高いlossに対して重みを減らすためのgamma値、低いほど効果が強く、論文では5が推奨",
            ),
        ),
        (
            "--scale_v_pred_loss_like_noise_pred",
            dict(
                action="store_true",
                help="scale v-prediction loss like noise prediction loss / v-prediction lossをnoise prediction lossと同じようにスケーリングする",
            ),
        ),
        (
            "--v_pred_like_loss",
            dict(
                type=float,
                default=None,
                help="add v-prediction like loss multiplied by this value / v-prediction lossをこの値をかけたものをlossに加算する",
            ),
        ),
    ]
    if support_weighted_captions:
        option_specs.append(
            (
                "--weighted_captions",
                dict(
                    action="store_true",
                    default=False,
                    help="Enable weighted captions in the standard style (token:1.3). No commas inside parens, or shuffle/dropout may break the decoder. / 「[token]」、「(token)」「(token:1.3)」のような重み付きキャプションを有効にする。カンマを括弧内に入れるとシャッフルやdropoutで重みづけがおかしくなるので注意",
                ),
            )
        )

    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
# Tokenizer for weighted-prompt syntax (verbose mode, one alternative per
# line): backslash escapes, opening brackets, ":<number>)" weight suffixes,
# closing brackets, runs of ordinary text, and a lone ":".
re_attention = re.compile(
    r"""
\\\(|
\\\)|
\\\[|
\\]|
\\\\|
\\|
\(|
\[|
:([+-]?[.\d]+)\)|
\)|
]|
[^\\()\[\]:]+|
:
""",
    re.X,
)


def parse_prompt_attention(text):
    r"""
    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
    Accepted tokens are:
      (abc) - increases attention to abc by a multiplier of 1.1
      (abc:3.12) - increases attention to abc by a multiplier of 3.12
      [abc] - decreases attention to abc by dividing it by 1.1
      \( - literal character '('
      \[ - literal character '['
      \) - literal character ')'
      \] - literal character ']'
      \\ - literal character '\'
      anything else - just text

    Unclosed brackets are treated as if they were closed at the end of the
    text. Adjacent fragments that end up with the same weight are merged.
    An input that produces no fragments yields ``[["", 1.0]]``.

    >>> parse_prompt_attention('normal text')
    [['normal text', 1.0]]
    >>> parse_prompt_attention('an (important) word')
    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
    >>> parse_prompt_attention('(unbalanced')
    [['unbalanced', 1.1]]
    >>> parse_prompt_attention('\(literal\]')
    [['(literal]', 1.0]]
    >>> parse_prompt_attention('(unnecessary)(parens)')
    [['unnecessaryparens', 1.1]]
    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
    [['a ', 1.0],
     ['house', 1.5730000000000004],
     [' ', 1.1],
     ['on', 1.0],
     [' a ', 1.1],
     ['hill', 0.55],
     [', sun, ', 1.1],
     ['sky', 1.4641000000000006],
     ['.', 1.1]]
    """
    res = []
    # Stacks of indices into ``res`` marking where each open bracket began.
    round_brackets = []
    square_brackets = []

    round_bracket_multiplier = 1.1
    square_bracket_multiplier = 1 / 1.1

    def multiply_range(start_position, multiplier):
        # Apply ``multiplier`` to every fragment collected since the bracket opened.
        for p in range(start_position, len(res)):
            res[p][1] *= multiplier

    for m in re_attention.finditer(text):
        # Separate name so the ``text`` parameter is not shadowed.
        token = m.group(0)
        weight = m.group(1)

        if token.startswith("\\"):
            # Escaped character: keep it literally, without the backslash.
            res.append([token[1:], 1.0])
        elif token == "(":
            round_brackets.append(len(res))
        elif token == "[":
            square_brackets.append(len(res))
        elif weight is not None and len(round_brackets) > 0:
            multiply_range(round_brackets.pop(), float(weight))
        elif token == ")" and len(round_brackets) > 0:
            multiply_range(round_brackets.pop(), round_bracket_multiplier)
        elif token == "]" and len(square_brackets) > 0:
            multiply_range(square_brackets.pop(), square_bracket_multiplier)
        else:
            res.append([token, 1.0])

    # Close whatever is still open at the end of the input.
    for pos in round_brackets:
        multiply_range(pos, round_bracket_multiplier)
    for pos in square_brackets:
        multiply_range(pos, square_bracket_multiplier)

    if len(res) == 0:
        res = [["", 1.0]]

    # Merge runs of identical weights in a single pass.
    merged = []
    for fragment, fragment_weight in res:
        if merged and merged[-1][1] == fragment_weight:
            merged[-1][0] += fragment
        else:
            merged.append([fragment, fragment_weight])
    return merged
def get_prompts_with_weights(tokenizer, prompt: List[str], max_length: int):
    r"""
    Tokenize each prompt and pair every token with its attention weight.

    Each prompt is split into weighted fragments by ``parse_prompt_attention``;
    every fragment is tokenized on its own with the first and last token of
    the tokenizer output dropped, so no start, end or padding tokens appear
    in the result. Sequences longer than ``max_length`` are cut to
    ``max_length`` and a notice is printed once.

    Returns ``(tokens, weights)``: two lists with one inner list per prompt.
    """
    all_tokens = []
    all_weights = []
    any_truncated = False

    for text in prompt:
        ids = []
        id_weights = []
        for fragment, fragment_weight in parse_prompt_attention(text):
            # tokenize and discard the starting and the ending token
            piece = tokenizer(fragment).input_ids[1:-1]
            ids += piece
            # one weight entry per produced token
            id_weights += [fragment_weight] * len(piece)
            # no point tokenizing further once past the limit
            if len(ids) > max_length:
                break

        if len(ids) > max_length:
            any_truncated = True
            ids = ids[:max_length]
            id_weights = id_weights[:max_length]

        all_tokens.append(ids)
        all_weights.append(id_weights)

    if any_truncated:
        print("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
    return all_tokens, all_weights
def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_middle=True, chunk_length=77):
    r"""
    Pad token and weight lists in place and return them.

    Every token list becomes ``bos`` + tokens + enough ``eos`` to reach
    ``max_length``. Weights are padded with 1.0:

    * ``no_boseos_middle=True``: one leading 1.0, then trailing 1.0s up to
      ``max_length``.
    * otherwise: the weights are cut into pieces of ``chunk_length - 2``, each
      piece is wrapped in a leading and trailing 1.0, and the result is padded
      to ``chunks * chunk_length`` entries.
    """
    body_length = chunk_length - 2
    n_chunks = (max_length - 2) // body_length
    target_weight_count = max_length if no_boseos_middle else n_chunks * chunk_length

    for idx in range(len(tokens)):
        ids = tokens[idx]
        tokens[idx] = [bos] + ids + [eos] * (max_length - 1 - len(ids))

        current = weights[idx]
        if no_boseos_middle:
            weights[idx] = [1.0] + current + [1.0] * (max_length - 1 - len(current))
            continue

        if len(current) == 0:
            weights[idx] = [1.0] * target_weight_count
            continue

        padded = []
        for chunk_no in range(n_chunks):
            start = chunk_no * body_length
            stop = min(len(current), start + body_length)
            padded.append(1.0)  # weight for starting token in this chunk
            padded.extend(current[start:stop])
            padded.append(1.0)  # weight for ending token in this chunk
        padded.extend([1.0] * (target_weight_count - len(padded)))
        weights[idx] = padded

    return tokens, weights
def get_unweighted_text_embeddings(
    tokenizer,
    text_encoder,
    text_input: torch.Tensor,
    chunk_length: int,
    clip_skip: int,
    eos: int,
    pad: int,
    no_boseos_middle: Optional[bool] = True,
):
    """
    Encode ``text_input``, splitting it into encoder-sized chunks when needed.

    When the input spans more than one chunk of ``chunk_length`` tokens, the
    chunks are encoded one by one and concatenated along dimension 1. Each
    chunk gets the start token of the input at its head and an end token at
    its tail before encoding. With ``no_boseos_middle`` the embeddings of
    those inner start/end positions are dropped so that only the very first
    start and very last end remain.

    ``clip_skip`` of ``None`` or 1 uses the encoder's first output; any other
    value takes hidden state ``-clip_skip`` and applies the encoder's final
    layer norm. ``tokenizer`` is not used by this function.
    """

    def encode(ids):
        # Shared by the chunked and the single-pass path.
        if clip_skip is None or clip_skip == 1:
            return text_encoder(ids)[0]
        enc_out = text_encoder(ids, output_hidden_states=True, return_dict=True)
        hidden = enc_out["hidden_states"][-clip_skip]
        return text_encoder.text_model.final_layer_norm(hidden)

    body_length = chunk_length - 2
    n_chunks = (text_input.shape[1] - 2) // body_length
    if n_chunks <= 1:
        return encode(text_input)

    final_index = n_chunks - 1
    pieces = []
    for index in range(n_chunks):
        # extract the chunk: its body plus one position on either side
        start = index * body_length
        chunk = text_input[:, start : start + chunk_length].clone()

        # cover the head and the tail by the starting and the ending tokens
        chunk[:, 0] = text_input[0, 0]
        if pad == eos:  # v1
            chunk[:, -1] = text_input[0, -1]
        else:  # v2
            for row in range(len(chunk)):
                if chunk[row, -1] != eos and chunk[row, -1] != pad:  # chunk ends on an ordinary token
                    chunk[row, -1] = eos
                if chunk[row, 1] == pad:  # only BOS, everything after it is PAD
                    chunk[row, 1] = eos

        embedding = encode(chunk)

        if no_boseos_middle:
            if index == 0:
                # discard the ending token
                embedding = embedding[:, :-1]
            elif index == final_index:
                # discard the starting token
                embedding = embedding[:, 1:]
            else:
                # discard both starting and ending tokens
                embedding = embedding[:, 1:-1]
        pieces.append(embedding)

    return torch.concat(pieces, axis=1)
def get_weighted_text_embeddings(
    tokenizer,
    text_encoder,
    prompt: Union[str, List[str]],
    device,
    max_embeddings_multiples: Optional[int] = 3,
    no_boseos_middle: Optional[bool] = False,
    clip_skip=None,
):
    r"""
    Encode prompts that use bracket weighting, e.g. 'A (very beautiful) masterpiece'.

    The prompts are parsed and tokenized with per-token weights, padded to a
    whole number of encoder-sized chunks, encoded, and the embeddings are
    multiplied by the token weights. Afterwards each prompt's embedding is
    rescaled so that its mean over the last two dimensions matches the mean
    before weighting.

    Args:
        prompt (`str` or `List[str]`):
            The prompt or prompts to encode; a single string is treated as a
            one-element list.
        device:
            Device on which the token and weight tensors are created.
        max_embeddings_multiples (`int`, *optional*, defaults to `3`):
            Upper bound on how many encoder-length chunks a prompt may span.
        no_boseos_middle (`bool`, *optional*, defaults to `False`):
            Forwarded to the padding and encoding helpers; controls whether
            start/end positions inside the sequence are kept.
        clip_skip:
            Forwarded to ``get_unweighted_text_embeddings``.
    """
    chunk_length = tokenizer.model_max_length
    body_length = chunk_length - 2

    prompts = [prompt] if isinstance(prompt, str) else prompt

    # Token budget excluding the outer start/end tokens.
    token_budget = body_length * max_embeddings_multiples
    prompt_tokens, prompt_weights = get_prompts_with_weights(tokenizer, prompts, token_budget)

    # round up the longest length of tokens to a multiple of (model_max_length - 2)
    longest = max(len(ids) for ids in prompt_tokens)
    n_chunks = max(1, min(max_embeddings_multiples, (longest - 1) // body_length + 1))
    padded_length = body_length * n_chunks + 2

    # pad the length of tokens and weights
    bos = tokenizer.bos_token_id
    eos = tokenizer.eos_token_id
    pad = tokenizer.pad_token_id
    prompt_tokens, prompt_weights = pad_tokens_and_weights(
        prompt_tokens,
        prompt_weights,
        padded_length,
        bos,
        eos,
        no_boseos_middle=no_boseos_middle,
        chunk_length=chunk_length,
    )
    token_tensor = torch.tensor(prompt_tokens, dtype=torch.long, device=device)

    # get the embeddings
    embeddings = get_unweighted_text_embeddings(
        tokenizer,
        text_encoder,
        token_tensor,
        chunk_length,
        clip_skip,
        eos,
        pad,
        no_boseos_middle=no_boseos_middle,
    )
    weight_tensor = torch.tensor(prompt_weights, dtype=embeddings.dtype, device=device)

    # assign weights to the prompts and normalize in the sense of mean
    mean_before = embeddings.float().mean(axis=[-2, -1]).to(embeddings.dtype)
    weighted = embeddings * weight_tensor.unsqueeze(-1)
    mean_after = weighted.float().mean(axis=[-2, -1]).to(weighted.dtype)
    return weighted * (mean_before / mean_after).unsqueeze(-1).unsqueeze(-1)
# https://wandb.ai/johnowhitaker/multires_noise/reports/Multi-Resolution-Noise-for-Diffusion-Model-Training--VmlldzozNjYyOTU2
def pyramid_noise_like(noise, device, iterations=6, discount=0.4):
    """Add progressively coarser random noise to ``noise`` and renormalize.

    At level ``i`` a random tensor is drawn at a resolution reduced by a
    random factor in [2, 4) raised to the power ``i``, upsampled bilinearly
    to the input resolution and added with weight ``discount**i``. The loop
    stops early once either reduced dimension reaches 1. ``noise`` is
    modified in place; the returned tensor is ``noise`` divided by its
    standard deviation.
    """
    batch, channels, width, height = noise.shape
    upsample = torch.nn.Upsample(size=(width, height), mode="bilinear").to(device)
    for level in range(iterations):
        # Rather than always going 2x, draw the base factor per level.
        shrink = (random.random() * 2 + 2) ** level
        low_w = max(1, int(width / shrink))
        low_h = max(1, int(height / shrink))
        coarse = torch.randn(batch, channels, low_w, low_h).to(device)
        noise += upsample(coarse) * discount**level
        if low_w == 1 or low_h == 1:
            break  # Lowest resolution is 1x1
    return noise / noise.std()  # Scaled back to roughly unit variance
# https://www.crosslabs.org//blog/diffusion-with-offset-noise
def apply_noise_offset(latents, noise, noise_offset, adaptive_noise_scale):
    """Add a per-(sample, channel) random offset to ``noise``.

    Returns ``noise`` unchanged when ``noise_offset`` is ``None``. Otherwise
    one random value per batch element and channel (shape ``(B, C, 1, 1)``)
    is scaled by the offset and added. With ``adaptive_noise_scale`` the
    offset is increased by that scale times the absolute spatial mean of
    ``latents`` and then clamped to be non-negative.
    """
    if noise_offset is None:
        return noise

    offset = noise_offset
    if adaptive_noise_scale is not None:
        # latent shape: (batch_size, channels, height, width)
        # abs mean value for each channel
        channel_mean = torch.abs(latents.mean(dim=(2, 3), keepdim=True))
        # clamp in case a negative adaptive scale pushed the offset below zero
        offset = torch.clamp(offset + adaptive_noise_scale * channel_mean, 0.0, None)

    batch, channels = latents.shape[0], latents.shape[1]
    per_channel = torch.randn((batch, channels, 1, 1), device=latents.device)
    return noise + offset * per_channel
+"""
+##########################################
+# Perlin Noise
+def rand_perlin_2d(device, shape, res, fade=lambda t: 6 * t**5 - 15 * t**4 + 10 * t**3):
+    delta = (res[0] / shape[0], res[1] / shape[1])
+    d = (shape[0] // res[0], shape[1] // res[1])
+    grid = (
+        torch.stack(
+            torch.meshgrid(torch.arange(0, res[0], delta[0], device=device), torch.arange(0, res[1], delta[1], device=device)),
+            dim=-1,
+        )
+        % 1
+    )
+    angles = 2 * torch.pi * torch.rand(res[0] + 1, res[1] + 1, device=device)
+    gradients = torch.stack((torch.cos(angles), torch.sin(angles)), dim=-1)
+    tile_grads = (
+        lambda slice1, slice2: gradients[slice1[0] : slice1[1], slice2[0] : slice2[1]]
+        .repeat_interleave(d[0], 0)
+        .repeat_interleave(d[1], 1)
+    )
+    dot = lambda grad, shift: (
+        torch.stack((grid[: shape[0], : shape[1], 0] + shift[0], grid[: shape[0], : shape[1], 1] + shift[1]), dim=-1)
+        * grad[: shape[0], : shape[1]]
+    ).sum(dim=-1)
+    n00 = dot(tile_grads([0, -1], [0, -1]), [0, 0])
+    n10 = dot(tile_grads([1, None], [0, -1]), [-1, 0])
+    n01 = dot(tile_grads([0, -1], [1, None]), [0, -1])
+    n11 = dot(tile_grads([1, None], [1, None]), [-1, -1])
+    t = fade(grid[: shape[0], : shape[1]])
+    return 1.414 * torch.lerp(torch.lerp(n00, n10, t[..., 0]), torch.lerp(n01, n11, t[..., 0]), t[..., 1])
+def rand_perlin_2d_octaves(device, shape, res, octaves=1, persistence=0.5):
+    noise = torch.zeros(shape, device=device)
+    frequency = 1
+    amplitude = 1
+    for _ in range(octaves):
+        noise += amplitude * rand_perlin_2d(device, shape, (frequency * res[0], frequency * res[1]))
+        frequency *= 2
+        amplitude *= persistence
+    return noise
+def perlin_noise(noise, device, octaves):
+    _, c, w, h = noise.shape
+    perlin = lambda: rand_perlin_2d_octaves(device, (w, h), (4, 4), octaves)
+    noise_perlin = []
+    for _ in range(c):
+        noise_perlin.append(perlin())
+    noise_perlin = torch.stack(noise_perlin).unsqueeze(0)   # (1, c, w, h)
+    noise += noise_perlin # broadcast for each batch
+    return noise / noise.std()  # Scaled back to roughly unit variance
+"""

library/train_util.py ADDED Viewed

The diff for this file is too large to render. See raw diff