Spaces:

PAIR
/

FlowDIS

Running on Zero

App Files Files Community

AndranikSargsyan committed on 4 days ago

Commit

a8a9bce

0 Parent(s):

Add FlowDIS inference and demo

Browse files

Files changed (29) hide show

.gitattributes +5 -0
.gitignore +43 -0
APACHE-2.0-LICENSE +201 -0
LICENSE +311 -0
README.md +13 -0
app.py +480 -0
assets/examples/0.jpg +3 -0
assets/examples/1.jpg +3 -0
assets/examples/2.png +3 -0
assets/examples/3.jpg +3 -0
assets/examples/4.jpg +3 -0
assets/examples/5.jpg +3 -0
assets/examples/6.jpg +3 -0
assets/examples/examples.csv +8 -0
assets/examples/prompts.json +9 -0
assets/preview.png +3 -0
flowdis/__init__.py +16 -0
flowdis/autoencoder.py +318 -0
flowdis/conditioner.py +44 -0
flowdis/configs.py +32 -0
flowdis/layers.py +263 -0
flowdis/loaders.py +75 -0
flowdis/math.py +30 -0
flowdis/model.py +118 -0
flowdis/sampling.py +136 -0
flowdis/util.py +116 -0
pyproject.toml +50 -0
qwen.py +73 -0
requirements.txt +13 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,5 @@

+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,43 @@

+# Gradio temporary files
+gradio_temp/
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual Environment
+venv/
+env/
+ENV/
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# OS
+.DS_Store
+Thumbs.db
+outputs/
+.gradio/

APACHE-2.0-LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

LICENSE ADDED Viewed

	@@ -0,0 +1,311 @@

+               PicsArt Inc. FlowDIS Model License v1.0
+                    Non-Commercial Use License
+PicsArt Inc. ("PicsArt," "we," "our," or "Company") makes the weights,
+parameters, and inference code for FlowDIS (as defined below)
+available for your non-commercial and non-production use under the
+terms of this License.
+FlowDIS is a derivative of FLUX.1 [schnell] by Black Forest Labs,
+Inc., which is licensed under the Apache License, Version 2.0. The
+original FLUX.1 [schnell] model and its associated copyright, patent,
+trademark, and attribution notices are included with this
+distribution. A copy of the Apache License, Version 2.0 is provided
+in the accompanying APACHE-2.0-LICENSE file. This model contains
+modifications made by PicsArt Inc. to the original FLUX.1 [schnell]
+model. By downloading, accessing, using, Distributing, or creating a
+Derivative of FlowDIS, you agree to the terms of this License. If you
+do not agree, you have no rights to access, use, Distribute, or
+create a Derivative of FlowDIS and must immediately cease using it.
+If you accept this License on behalf of your employer or another
+entity, you represent and warrant that you have full legal authority
+to bind that employer or entity.
+1. Definitions
+   (a) "Derivative" means any (i) modified version of FlowDIS (including
+       any fine-tuned or distilled version), (ii) work based on FlowDIS,
+       or (iii) any other derivative work thereof. For clarity, Outputs
+       are not Derivatives.
+   (b) "Distribution" or "Distribute" means providing or making
+       available, by any means, a copy of FlowDIS and/or Derivatives.
+   (c) "Non-Commercial Purpose" means any of the following uses, but
+       only so far as you do not receive any direct or indirect payment
+       arising from the use of FlowDIS or Derivatives:
+       (i) personal use for research, experimentation, and testing for the
+           benefit of public knowledge, personal study, private
+           entertainment, hobby projects, or otherwise not directly or
+           indirectly connected to any commercial activities, business
+           operations, or employment responsibilities;
+      (ii) use by commercial or for-profit entities for testing,
+           evaluation, or non-commercial research and development in a
+           non-production environment; and
+     (iii) use by any charitable organization for charitable purposes,
+           or for testing or evaluation. For clarity, use (a) for
+           revenue-generating activity, (b) in direct interactions with
+           or that has impact on end users, or (c) to train, fine-tune,
+           or distill other models for commercial use, in each case, is
+           not a Non-Commercial Purpose.
+   (d) "Outputs" means any content generated by the operation of FlowDIS
+       or Derivatives from an input or prompt provided by users. Outputs
+       do not include any components of FlowDIS such as fine-tuned
+       versions, weights, or parameters.
+   (e) "you" or "your" means the individual or entity entering into this
+       License with Company.
+2. License Grant
+   (a) License. Subject to your compliance with this License, Company
+       grants you a non-exclusive, worldwide, non-transferable,
+       non-sublicensable, revocable, royalty-free, and limited license
+       to access, use, create Derivatives of, and Distribute FlowDIS and
+       Derivatives solely for Non-Commercial Purposes. This license is
+       personal to you, and you may not assign or sublicense this
+       License or any rights or obligations under it without Company's
+       prior written consent; any such assignment or sublicense will be
+       void and will automatically and immediately terminate this
+       License. Any restrictions set forth herein regarding FlowDIS also
+       apply to any Derivative you create or that is created on your
+       behalf.
+   (b) Non-Commercial Use Only. You may only access, use, Distribute, or
+       create Derivatives of FlowDIS or Derivatives for Non-Commercial
+       Purposes. If you wish to use FlowDIS or a Derivative for any
+       purpose not expressly authorized under this License, you must
+       request a license from Company, which Company may grant in its
+       sole discretion and which may be subject to a fee, royalty, or
+       other revenue share.
+   (c) Reserved Rights. The grant of rights expressly set forth in this
+       License constitutes the complete grant of rights to use FlowDIS,
+       and no other licenses are granted, whether by waiver, estoppel,
+       implication, equity, or otherwise. Company and its licensors
+       reserve all rights not expressly granted by this License.
+   (d) Outputs. Company claims no ownership rights in Outputs. You are
+       solely responsible for the Outputs you generate and their
+       subsequent uses in accordance with this License. You may use
+       Outputs for any purpose (including commercial purposes), except
+       as expressly prohibited herein.
+3. Distribution
+   Subject to this License, you may Distribute copies of FlowDIS and/or
+   Derivatives made by you under the following conditions:
+   (a) You must make available a copy of this License to third-party
+       recipients of FlowDIS and/or Derivatives you Distribute, and
+       specify that any rights to use FlowDIS and/or Derivatives shall
+       be granted directly by Company to said third-party recipients
+       pursuant to this License.
+   (b) You must prominently display the following notice alongside the
+       Distribution (such as via a "NOTICE" text file distributed as
+       part of FlowDIS or the Derivative) (the "Attribution Notice"):
+       This model is licensed by PicsArt Inc. under the PicsArt Inc.
+       FlowDIS Model License v1.0. Copyright 2026 PicsArt Inc. This
+       model is a derivative of FLUX.1 [schnell] by Black Forest Labs,
+       Inc., licensed under the Apache License, Version 2.0. IN NO EVENT
+       SHALL PICSART INC. BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+       LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+       ARISING FROM, OUT OF OR IN CONNECTION WITH USE OF THIS MODEL.
+   (c) In the case of Distribution of Derivatives made by you: (i) you
+       must include in the Attribution Notice a statement that you have
+       modified FlowDIS; (ii) any terms and conditions you impose on
+       third-party recipients relating to your Derivatives shall neither
+       limit such recipients' use of FlowDIS or any Derivatives made by
+       Company in accordance with this License, nor conflict with any of
+       its terms and conditions, and must include disclaimer of
+       warranties and limitation of liability provisions at least as
+       protective of Company as those set forth herein; and (iii) you
+       must not misrepresent or imply that Derivatives made by or for
+       you are an official product of PicsArt Inc. or have been
+       endorsed, approved, or validated by PicsArt Inc., unless
+       authorized by Company in writing.
+   (d) Apache 2.0 Compliance. All Distributions must include: (i) a copy
+       of the Apache License, Version 2.0; (ii) all copyright, patent,
+       trademark, and attribution notices from the original FLUX.1
+       [schnell] model; and (iii) prominent notices stating that the
+       files have been modified from the original FLUX.1 [schnell]
+       model.
+4. Restrictions
+   You will not, and will not permit, assist, or cause any third party
+   to:
+   (a) use, modify, copy, reproduce, create Derivatives of, or
+       Distribute FlowDIS (or any Derivative or data produced by
+       FlowDIS), in whole or in part, for:
+       (i) any commercial or production purpose;
+      (ii) any military purpose, including research, development,
+           design, manufacture, production, or use of weapons, weapons
+           systems, munitions, or any military or defense applications;
+     (iii) purposes of surveillance, including any research or
+           development relating to surveillance;
+      (iv) biometric processing;
+       (v) any manner that infringes, misappropriates, or otherwise violates
+           any third party's legal rights, including rights of publicity or
+           digital replica rights;
+      (vi) any unlawful, fraudulent, defamatory, or abusive activity;
+     (vii) generating unlawful content, including child sexual abuse
+           material or non-consensual intimate images; or
+    (viii) any manner that violates any applicable law, privacy or
+           security laws, rules, regulations, directives, or
+           governmental requirements (including the GDPR, the California
+           Consumer Privacy Act, laws governing the processing of
+           biometric information, and the EU AI Act, as well as all
+           amendments and successor laws to any of the foregoing);
+   (b) alter or remove copyright and other proprietary notices which
+       appear on or in any portion of FlowDIS;
+   (c) utilize any equipment, device, software, or other means to
+       circumvent or remove any security or protection used by Company
+       in connection with FlowDIS, or to circumvent or remove any usage
+       restrictions, or to enable functionality disabled by Company;
+   (d) offer or impose any terms on FlowDIS that alter, restrict, or are
+       inconsistent with the terms of this License;
+   (e) violate any applicable U.S. and non-U.S. export control and trade
+       sanctions laws ("Export Laws") in connection with your use or
+       Distribution of FlowDIS; or
+   (f) directly or indirectly Distribute, export, or otherwise transfer
+       FlowDIS (i) to any individual, entity, or country prohibited by
+       Export Laws; (ii) to anyone on U.S. or non-U.S. government
+       restricted parties lists; (iii) for any purpose prohibited by
+       Export Laws, including nuclear, chemical or biological weapons,
+       or missile technology applications; (iv) if you or they are
+       located in a comprehensively sanctioned jurisdiction, currently
+       listed on any U.S. or non-U.S. restricted parties list, or for
+       any purpose prohibited by Export Laws; or (v) while disguising
+       your location through IP proxying or other methods.
+5. Disclaimers
+   THE MODEL IS PROVIDED "AS IS" AND "WITH ALL FAULTS" WITH NO WARRANTY
+   OF ANY KIND, EXPRESS OR IMPLIED. COMPANY EXPRESSLY DISCLAIMS ALL
+   REPRESENTATIONS AND WARRANTIES, EXPRESS OR IMPLIED, WHETHER BY
+   STATUTE, CUSTOM, USAGE OR OTHERWISE, INCLUDING BUT NOT LIMITED TO THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+   PURPOSE, TITLE, SATISFACTORY QUALITY, OR NON-INFRINGEMENT. COMPANY
+   MAKES NO WARRANTIES OR REPRESENTATIONS THAT THE MODEL WILL BE ERROR
+   FREE OR FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS, OR PRODUCE ANY
+   PARTICULAR RESULTS.
+6. Limitation of Liability
+   TO THE FULLEST EXTENT PERMITTED BY LAW, IN NO EVENT WILL COMPANY BE
+   LIABLE TO YOU OR YOUR EMPLOYEES, AFFILIATES, USERS, OFFICERS, OR
+   DIRECTORS (A) UNDER ANY THEORY OF LIABILITY, WHETHER BASED IN
+   CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY, WARRANTY, OR OTHERWISE
+   UNDER THIS LICENSE, OR (B) FOR ANY INDIRECT, CONSEQUENTIAL,
+   EXEMPLARY, INCIDENTAL, PUNITIVE, OR SPECIAL DAMAGES OR LOST PROFITS,
+   EVEN IF COMPANY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+   THE MODEL, ITS CONSTITUENT COMPONENTS, AND ANY OUTPUT (COLLECTIVELY,
+   "MODEL MATERIALS") ARE NOT DESIGNED OR INTENDED FOR USE IN ANY
+   APPLICATION OR SITUATION WHERE FAILURE OR FAULT COULD REASONABLY BE
+   ANTICIPATED TO LEAD TO SERIOUS INJURY OF ANY PERSON, INCLUDING
+   POTENTIAL DISCRIMINATION OR VIOLATION OF AN INDIVIDUAL'S PRIVACY
+   RIGHTS, OR TO SEVERE PHYSICAL, PROPERTY, OR ENVIRONMENTAL DAMAGE
+   (EACH, A "HIGH-RISK USE"). IF YOU ELECT TO USE ANY MODEL MATERIALS
+   FOR A HIGH-RISK USE, YOU DO SO AT YOUR OWN RISK. YOU AGREE TO DESIGN
+   AND IMPLEMENT APPROPRIATE DECISION-MAKING AND RISK-MITIGATION
+   PROCEDURES AND POLICIES IN CONNECTION WITH A HIGH-RISK USE.
+7. Indemnification
+   You will indemnify, defend, and hold harmless Company and its
+   subsidiaries and affiliates, and each of their respective
+   shareholders, directors, officers, employees, agents, successors, and
+   assigns (collectively, the "Company Parties") from and against any
+   losses, liabilities, damages, fines, penalties, and expenses
+   (including reasonable attorneys' fees) incurred by any Company Party
+   in connection with any claim, demand, allegation, lawsuit,
+   proceeding, or investigation ("Claims") arising out of or related to:
+   (a) your access to or use of FlowDIS (including any Output or data
+   generated from such use), including any High-Risk Use; (b) your
+   violation of this License; or (c) your violation, misappropriation,
+   or infringement of any rights of another (including intellectual
+   property or other proprietary rights and privacy rights). You will
+   promptly notify the Company Parties of any such Claims and cooperate
+   with Company Parties in defending such Claims. You will also grant
+   the Company Parties sole control of the defense or settlement, at
+   Company's sole option, of any Claims. This indemnity is in addition
+   to, and not in lieu of, any other indemnities or remedies set forth
+   in a written agreement between you and Company or the other Company
+   Parties.
+8. Termination; Survival
+   (a) This License will automatically terminate upon any breach by you
+       of the terms of this License.
+   (b) Company may terminate this License, in whole or in part, at any
+       time upon notice (including electronic) to you.
+   (c) If you initiate any legal action or proceedings against Company
+       or any other entity (including a cross-claim or counterclaim),
+       alleging that FlowDIS, any Derivative, or any part thereof,
+       infringes upon intellectual property or other rights owned or
+       licensable by you, then any licenses granted to you under this
+       License will immediately terminate as of the date such legal
+       action or claim is filed.
+   (d) Upon termination, you must cease all use, access, or Distribution
+       of FlowDIS and any Derivatives. Sections 2(c), 2(d), 4 through 11
+       survive termination.
+9. Third-Party Materials
+   FlowDIS is derived from FLUX.1 [schnell] by Black Forest Labs, Inc.,
+   and may contain additional third-party software or components
+   (including free and open-source software) ("Third-Party Materials"),
+   which are subject to the license terms of the respective third-party
+   licensors. Your dealings or correspondence with third parties and
+   your use of or interaction with any Third-Party Materials are solely
+   between you and the third party. Company does not control or endorse,
+   and makes no representations or warranties regarding, any Third-Party
+   Materials, and your access to and use of such Third-Party Materials
+   are at your own risk.
+10. Trademarks
+   No trademark license is granted as part of this License. You may not
+   use any name, logo, or trademark associated with PicsArt Inc. without
+   Company's prior written permission, except to the extent necessary to
+   make the reference required in the Attribution Notice or as is
+   reasonably necessary in describing FlowDIS and its creators.
+11. General
+   This License will be governed and construed under the laws of the
+   State of Delaware without regard to conflicts of law provisions. If
+   any provision or part of a provision of this License is unlawful,
+   void, or unenforceable, that provision or part is deemed severed from
+   this License and will not affect the validity and enforceability of
+   any remaining provisions. The failure of Company to exercise or
+   enforce any right or provision of this License will not operate as a
+   waiver of such right or provision. This License does not confer any
+   third-party beneficiary rights upon any other person or entity. This
+   License, together with any accompanying documentation, contains the
+   entire understanding between you and Company regarding its subject
+   matter and supersedes all other written or oral agreements and
+   understandings between you and Company regarding such subject matter.

README.md ADDED Viewed

	@@ -0,0 +1,13 @@

+---
+title: FlowDIS
+emoji: 🌀
+colorFrom: indigo
+colorTo: blue
+sdk: gradio
+sdk_version: 6.3.0
+python_version: 3.12
+app_file: app.py
+pinned: true
+thumbnail: assets/preview.png
+---
+Paper: https://arxiv.org/abs/2605.05077

app.py ADDED Viewed

	@@ -0,0 +1,480 @@

+import csv
+import os
+import logging
+import uuid
+import shutil
+from copy import deepcopy
+from pathlib import Path
+# Set up logging
+# Configure logging once at import time: INFO level, timestamped "time | level | message" records.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s | %(levelname)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+logger = logging.getLogger(__name__)
+# Set Gradio temp directory BEFORE importing gradio to avoid permission issues
+TEMP_DIR = Path(__file__).parent / "gradio_temp"
+# Start every launch from an empty directory: remove whatever is already there, then recreate it.
+if TEMP_DIR.exists():
+    shutil.rmtree(str(TEMP_DIR))
+TEMP_DIR.mkdir(exist_ok=True)
+# Point both the Gradio-specific variable and the generic TMPDIR variable at that directory.
+os.environ["GRADIO_TEMP_DIR"] = str(TEMP_DIR)
+os.environ["TMPDIR"] = str(TEMP_DIR)
+import gradio as gr
+import numpy as np
+import torch
+from PIL import Image
+IS_HF_SPACE = os.environ.get("SPACE_ID") is not None
+try:
+    import spaces
+    zero_gpu = spaces.GPU
+except ImportError:
+    zero_gpu = lambda f: f
+from flowdis.sampling import flowdis_predict
+from flowdis.util import load_models
+from qwen import expand_prompt
+models = None
+device = "cuda"
+if torch.cuda.is_available():
+    models = load_models(device=device)
+else:
+    print("No GPU available, the demo will not be able to run.")
+def disable_download_btn():
+    return gr.update(interactive=False)
+@zero_gpu
+def process_image(image, prompt, expand_prompt_enabled, resolution, num_inference_steps):
+    """
+    Process the input image and prompt.
+    This is a placeholder function - replace with your actual processing logic.
+    Args:
+        image: PIL Image or numpy array
+        prompt: str, the text input from the user
+        expand_prompt_enabled: bool, whether to expand the prompt via the model
+        resolution: int, the inference resolution
+        num_inference_steps: int, the number of inference steps
+    Returns:
+        Processed image
+    """
+    if image is None:
+        return None, None
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+    logger.info(f"Original prompt: {prompt}")
+    if prompt != "" and expand_prompt_enabled:
+        prompt = expand_prompt(image, prompt)
+        logger.info(f"Expanded prompt: {prompt}")
+    num_inference_steps = int(num_inference_steps)
+    pred_mask = flowdis_predict(
+        image=image,
+        prompt=prompt,
+        models=models,
+        resolution=resolution,
+        num_inference_steps=num_inference_steps,
+        device=device,
+    )
+    blacked_image = Image.fromarray(np.array(image) * (np.array(pred_mask)[:, :, np.newaxis] > 0).astype(np.uint8))
+    transparent_png = Image.fromarray(np.dstack([blacked_image, np.array(pred_mask)]))
+    uid = uuid.uuid4().hex
+    png_path = TEMP_DIR / f"{uid}.png"
+    transparent_png.save(png_path)
+    return (
+        gr.update(value=[image, transparent_png], key=uid),
+        gr.update(value=str(png_path), interactive=True)
+    )
+# Load examples from assets/examples/examples.csv: image_name, prompt, resolution, num_steps
+_example_dir = Path(__file__).parent / "assets" / "examples"
+_examples_csv = _example_dir / "examples.csv"
+examples = []
+if _examples_csv.exists():
+    with open(_examples_csv, newline="", encoding="utf-8") as f:
+        for row in csv.DictReader(f):
+            image_path = str(_example_dir / row["image_name"].strip())
+            examples.append([
+                image_path,
+                row["prompt"].strip(),
+                True,  # expand prompt (default for examples)
+                int(row["resolution"].strip()),
+                int(row["num_steps"].strip()),
+            ])
+_head_js = """
+<style>
+#expand-prompt.is-disabled { pointer-events: none !important; }
+#expand-prompt.is-disabled label,
+#expand-prompt.is-disabled input,
+#expand-prompt.is-disabled .info { opacity: 0.4 !important; }
+/* Hide the "Expand prompt" column (3rd) in the examples table */
+#examples-table table th:nth-child(3),
+#examples-table table td:nth-child(3) { display: none !important; }
+</style>
+<script>
+(function() {
+    function findEls() {
+        return {
+            ta: document.querySelector('#text-prompt textarea, #text-prompt input'),
+            cb: document.querySelector('#expand-prompt'),
+        };
+    }
+    function syncFromText() {
+        var els = findEls();
+        if (!els.ta || !els.cb) return;
+        var empty = !els.ta.value.trim();
+        els.cb.classList.toggle('is-disabled', empty);
+        var input = els.cb.querySelector('input[type=checkbox]');
+        if (input) input.disabled = empty;
+    }
+    function init() {
+        var els = findEls();
+        if (!els.ta || !els.cb) { setTimeout(init, 200); return; }
+        els.ta.addEventListener('input', syncFromText);
+        els.ta.addEventListener('change', syncFromText);
+        // Catch programmatic value changes (e.g. example selection)
+        var lastVal = els.ta.value;
+        setInterval(function() {
+            if (els.ta.value !== lastVal) { lastVal = els.ta.value; syncFromText(); }
+        }, 250);
+        syncFromText();
+    }
+    if (document.readyState === 'loading')
+        document.addEventListener('DOMContentLoaded', init);
+    else
+        init();
+})();
+</script>
+<script>
+(function() {
+    function findEls() {
+        return {
+            ta: document.querySelector('#text-prompt textarea, #text-prompt input'),
+            cb: document.querySelector('#expand-prompt'),
+        };
+    }
+    function syncFromText() {
+        var els = findEls();
+        if (!els.ta || !els.cb) return;
+        var empty = !els.ta.value.trim();
+        els.cb.classList.toggle('is-disabled', empty);
+        var input = els.cb.querySelector('input[type=checkbox]');
+        if (input) input.disabled = empty;
+    }
+    function init() {
+        var els = findEls();
+        if (!els.ta || !els.cb) { setTimeout(init, 200); return; }
+        els.ta.addEventListener('input', syncFromText);
+        els.ta.addEventListener('change', syncFromText);
+        // Catch programmatic value changes (e.g. example selection)
+        var lastVal = els.ta.value;
+        setInterval(function() {
+            if (els.ta.value !== lastVal) { lastVal = els.ta.value; syncFromText(); }
+        }, 250);
+        syncFromText();
+    }
+    if (document.readyState === 'loading')
+        document.addEventListener('DOMContentLoaded', init);
+    else
+        init();
+})();
+</script>
+"""
+with gr.Blocks(
+    title="FlowDIS – Precise Background Removal",
+    head=_head_js,
+    theme=gr.themes.Default(
+        font=gr.themes.GoogleFont("Inter"),
+    ).set(
+        button_primary_background_fill="#C209C1",
+        button_primary_background_fill_dark="#C209C1",
+        button_primary_background_fill_hover="#d63bd5",
+        button_primary_background_fill_hover_dark="#d63bd5",
+        button_primary_text_color="#ffffff",
+        button_primary_text_color_dark="#ffffff",
+    ),
+    delete_cache=(1800, 1800)
+) as demo:
+    gr.HTML(
+        """
+       <style>
+        /* Theme-adaptive tokens */
+        :root {
+        --flow-text: #0f172a;          /* slate-900 */
+        --flow-muted: #475569;         /* slate-600 */
+        --flow-link: #2563eb;          /* blue-600 */
+        --flow-link-hover: #1d4ed8;    /* blue-700 */
+        --flow-title: #C209C1;         /* Picsart pink */
+        }
+        @media (prefers-color-scheme: dark) {
+        :root {
+            --flow-text: #f1f5f9;        /* slate-100 */
+            --flow-muted: #94a3b8;       /* slate-400 */
+            --flow-link: #60a5fa;        /* blue-400 */
+            --flow-link-hover: #93c5fd;  /* blue-300 */
+            --flow-title: #e45fe3;       /* Picsart pink (lighter for dark mode) */
+        }
+        }
+        .flow-header {
+        text-align: center;
+        max-width: 900px;
+        margin: 18px auto 12px auto;
+        font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
+        }
+        .flow-title {
+        font-size: 1.9rem;
+        font-weight: 750;
+        letter-spacing: -0.3px;
+        margin-bottom: 4px;
+        color: var(--flow-title);      /* title accent (needle stays as-is) */
+        }
+        .flow-links {
+        margin-bottom: 8px;
+        }
+        .flow-links a {
+        color: var(--flow-link);       /* cool blue links */
+        font-weight: 600;
+        text-decoration: none;
+        margin: 0 0px;
+        font-size: 0.95rem;
+        transition: color 0.2s ease, text-shadow 0.2s ease;
+        }
+        .flow-links a:hover {
+        color: var(--flow-link-hover);
+        text-shadow: 0 0 10px rgba(37, 99, 235, 0.25);
+        }
+        @media (prefers-color-scheme: dark) {
+        .flow-links a:hover {
+            text-shadow: 0 0 12px rgba(147, 197, 253, 0.35);
+        }
+        }
+        .flow-desc {
+        font-size: 0.95rem;
+        color: var(--flow-muted);
+        max-width: 650px;
+        margin: 0 auto;
+        line-height: 1.5;
+        }
+        .bg-btn-row { display: flex; gap: 6px; overflow-x: auto; scrollbar-width: thin; }
+        .bg-btn {
+            width: 42px !important; height: 42px !important;
+            border: 2.5px solid #aaa !important; border-radius: 8px !important;
+            cursor: pointer !important; flex-shrink: 0 !important;
+            padding: 0 !important; outline: none !important;
+            transition: transform 0.15s ease, box-shadow 0.15s ease,
+                        border-color 0.15s ease, filter 0.15s ease;
+        }
+        .bg-btn:hover {
+            transform: scale(1.15);
+            border-color: #333 !important;
+            box-shadow: 0 3px 10px rgba(0,0,0,0.4);
+            filter: brightness(1.15);
+        }
+        .bg-btn:active {
+            transform: scale(0.95);
+        }
+        @media (max-width: 1024px) {
+            #main-row {
+                flex-direction: column !important;
+                flex-wrap: wrap !important;
+            }
+            #main-row > * {
+                width: 100% !important;
+                flex: 1 1 100% !important;
+                min-width: 0 !important;
+            }
+        }
+        @media (max-width: 500px) {
+            #input-image { height: 400px !important; }
+        }
+        @media (max-width: 400px) {
+            #input-image { height: 300px !important; }
+        }
+        .prose :is(label span, .info) { font-weight: 400 !important; }
+        </style>
+        <div class="flow-header">
+        <div class="flow-title"><span style="color:#C209C1">✦</span> FlowDIS Demo</div>
+        <div class="flow-links">
+            <span>📄</span><a href="https://arxiv.org/" target="_blank" rel="noopener noreferrer">arXiv</a>
+            <span>💻</span><a href="https://github.com/Picsart-AI-Research/FlowDIS" target="_blank" rel="noopener noreferrer">Code</a>
+        </div>
+        <div class="flow-desc">
+            FlowDIS performs precise foreground segmentation, optionally guided by a text prompt to only preserve the specified objects.
+        </div>
+        </div>
+        """
+    )
+    with gr.Row(elem_id="main-row"):
+        # Left column: Input image, text field, and submit button
+        with gr.Column(scale=1):
+            input_image = gr.Image(
+                label="Input Image",
+                type="pil",
+                height=500,
+                elem_id="input-image",
+            )
+            text_input = gr.Textbox(
+                label="Text Prompt (Optional)",
+                placeholder="Enter what you want to retain...",
+                lines=1,
+                elem_id="text-prompt",
+            )
+            expand_prompt_check = gr.Checkbox(
+                label="Expand prompt",
+                value=True,
+                elem_id="expand-prompt",
+                info="Use Qwen3-VL-4B-Instruct model to expand the prompt for better text-guided segmentation.",
+            )
+            # Sliders for resolution and steps
+            with gr.Row():
+                with gr.Column(scale=1, min_width=300):
+                    resolution_slider = gr.Slider(
+                        minimum=1024,
+                        maximum=2048,
+                        value=1536,
+                        step=64,
+                        label="Inference Resolution",
+                        info="Higher resolution preserves more details.",
+                    )
+                with gr.Column(scale=1, min_width=300):
+                    steps_slider = gr.Slider(
+                        minimum=1,
+                        maximum=12,
+                        value=4,
+                        step=1,
+                        label="Number of Steps",
+                        info="More steps generate sharper results.",
+                    )
+            submit_btn = gr.Button("🚀 Remove Background", variant="primary")
+        # Right column: Output image
+        with gr.Column(scale=1):
+            output_image = gr.ImageSlider(
+                label="FlowDIS prediction",
+                type="pil",
+                format="webp",
+                height=500,
+                slider_position=10,
+                elem_id="output-slider",
+            )
+            # Row of swatch buttons that change the backdrop behind the output slider.
+            # CSS background value that draws a grey/white checkerboard
+            _checker = "repeating-conic-gradient(#ccc 0% 25%,#fff 0% 50%) 50%/12px 12px"
+            # (swatch style, value stored in data-bg) pairs; both entries are the same here
+            _bg_buttons = [
+                (_checker, _checker),
+                ("#ffffff", "#ffffff"),
+                ("#000000", "#000000"),
+                ("#00ff00", "#00ff00"),
+                ("#0000ff", "#0000ff"),
+                ("#ff0000", "#ff0000"),
+                ("#ffff00", "#ffff00"),
+                ("#ff00ff", "#ff00ff"),
+                ("#00ffff", "#00ffff"),
+            ]
+            # Inline click handler: create (once) a <style id="slider-bg-style"> element
+            # and rewrite its rule so the slider's img/canvas take the button's data-bg.
+            # Only single quotes are used so it can sit inside a double-quoted attribute.
+            _onclick = (
+                "var s=document.getElementById('slider-bg-style');"
+                "if(!s){s=document.createElement('style');"
+                "s.id='slider-bg-style';document.head.appendChild(s);}"
+                "s.textContent='#output-slider img,#output-slider canvas"
+                "{background:'+this.dataset.bg+' !important}';"
+            )
+            # One <button> per swatch, wrapped in the horizontally scrollable row
+            gr.HTML(
+                value='<div class="bg-btn-row">'
+                + "".join(
+                    f'<button class="bg-btn" style="background:{style}"'
+                    f' data-bg="{bg}" onclick="{_onclick}"></button>'
+                    for style, bg in _bg_buttons
+                )
+                + "</div>"
+            )
+            download_btn = gr.DownloadButton(
+                label="📥 Download PNG",
+                variant="primary",
+                interactive=False
+            )
+    # Connect the submit button to the processing function
+    submit_btn.click(
+        disable_download_btn,
+        outputs=download_btn
+    ).then(
+        fn=process_image,
+        inputs=[input_image, text_input, expand_prompt_check, resolution_slider, steps_slider],
+        outputs=[output_image, download_btn]
+    )
+    # Optional: Also trigger on text input enter key
+    text_input.submit(
+        disable_download_btn,
+        outputs=download_btn
+    ).then(
+        fn=process_image,
+        inputs=[input_image, text_input, expand_prompt_check, resolution_slider, steps_slider],
+        outputs=[output_image, download_btn],
+    )
+    examples_component = gr.Examples(
+        examples=examples,
+        inputs=[input_image, text_input, expand_prompt_check, resolution_slider, steps_slider],
+        label="Examples",
+        elem_id="examples-table",
+    )
+    examples_component.dataset.click(
+        disable_download_btn,
+        outputs=download_btn
+    ).then(
+        process_image,
+        inputs=[input_image, text_input, expand_prompt_check, resolution_slider, steps_slider],
+        outputs=[output_image, download_btn],
+    )
+# Launch the app
+if __name__ == "__main__":
+    demo.queue(max_size=20)
+    if IS_HF_SPACE:
+        demo.launch(allowed_paths=[str(TEMP_DIR), "assets"])
+    else:
+        demo.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=True,
+            allowed_paths=[str(TEMP_DIR), "assets"],
+        )

assets/examples/0.jpg ADDED Viewed

Git LFS Details

SHA256: 3c3404d60a4de67e2764479266bf6b7130ba1169d3fe0f37acebab5a62f8b16c
Pointer size: 131 Bytes
Size of remote file: 359 kB

assets/examples/1.jpg ADDED Viewed

Git LFS Details

SHA256: 51809ee1a7e27bbbbde5a0325723895fd2592e70dda327e2a06795bf7d98c0cf
Pointer size: 132 Bytes
Size of remote file: 1.09 MB

assets/examples/2.png ADDED Viewed

Git LFS Details

SHA256: 4d17b0f354a2c72baedc68137057e384ca68812b05d7070f4cfd4bb0ea8f49f5
Pointer size: 132 Bytes
Size of remote file: 2.59 MB

assets/examples/3.jpg ADDED Viewed

Git LFS Details

SHA256: 7bc8ac73466b7af3eb5f0ea9b7bb99b7cbcd541532641d7d56e5106ed3282cd3
Pointer size: 132 Bytes
Size of remote file: 2.41 MB

assets/examples/4.jpg ADDED Viewed

Git LFS Details

SHA256: da7bab1e177dbd5dd3aaf1cb8d5e7040d84b8ff3f2e844860735510e63eb8b66
Pointer size: 131 Bytes
Size of remote file: 890 kB

assets/examples/5.jpg ADDED Viewed

Git LFS Details

SHA256: 5c35568c50cc28582379689dc79382b61dcb7bd0352f5345248c50f641c48926
Pointer size: 131 Bytes
Size of remote file: 998 kB

assets/examples/6.jpg ADDED Viewed

Git LFS Details

SHA256: 4a27f6ccf014f0c2e073eda7bb148cf7c70eb40ff4e7027f69d3e34d63854bca
Pointer size: 132 Bytes
Size of remote file: 1.4 MB

assets/examples/examples.csv ADDED Viewed

	@@ -0,0 +1,8 @@

+image_name,prompt,resolution,num_steps
+0.jpg,,2048,8
+1.jpg,,2048,8
+2.png,,1536,4
+3.jpg,,1536,2
+4.jpg,measuring tape,2048,8
+5.jpg,white shoes,1280,2
+6.jpg,bicycle,2048,8

assets/examples/prompts.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "0.jpg": "",
+    "1.jpg": "",
+    "2.png": "",
+    "3.jpg": "",
+    "4.jpg": "measuring tape",
+    "5.jpg": "white shoes",
+    "6.jpg": "bicycle"
+}

assets/preview.png ADDED Viewed

Git LFS Details

SHA256: 6d640682e2c07159c5d43c530948dfcf87a036ae20b21f1f2a25395db45daf74
Pointer size: 131 Bytes
Size of remote file: 293 kB

flowdis/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+"""FlowDIS: Language-Guided Dichotomous Image Segmentation with Flow Matching"""
+from flowdis.configs import configs
+from flowdis.loaders import load_autoencoder, load_clip, load_t5, load_transformer
+from flowdis.sampling import flowdis_predict
+from flowdis.util import load_models
+__all__ = [
+    "configs",
+    "load_autoencoder",
+    "load_clip",
+    "load_t5",
+    "load_transformer",
+    "flowdis_predict",
+    "load_models",
+]

flowdis/autoencoder.py ADDED Viewed

	@@ -0,0 +1,318 @@

+from dataclasses import dataclass
+import torch
+from einops import rearrange
+from torch import Tensor, nn
+# Hyper-parameters consumed by AutoEncoder and forwarded to Encoder / Decoder.
+@dataclass
+class AutoEncoderParams:
+    resolution: int  # stored by Encoder/Decoder; Decoder derives z_shape from it
+    in_channels: int  # channels accepted by the encoder's first conv
+    ch: int  # base channel width, scaled per level by ch_mult
+    out_ch: int  # channels produced by the decoder's last conv
+    ch_mult: list[int]  # one width multiplier per resolution level
+    num_res_blocks: int  # ResNet blocks per encoder level (decoder uses one more)
+    z_channels: int  # latent channels; the encoder head emits 2 * z_channels
+    scale_factor: float  # encode(): z = scale_factor * (z - shift_factor)
+    shift_factor: float  # decode(): z = z / scale_factor + shift_factor
+def swish(x: Tensor) -> Tensor:
+    return x * torch.sigmoid(x)
+class AttnBlock(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+    def attention(self, h_: Tensor) -> Tensor:
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        b, c, h, w = q.shape
+        q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
+        k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
+        v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
+        h_ = nn.functional.scaled_dot_product_attention(q, k, v)
+        return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
+    def forward(self, x: Tensor) -> Tensor:
+        return x + self.proj_out(self.attention(x))
+class ResnetBlock(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if self.in_channels != self.out_channels:
+            self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, x):
+        h = x
+        h = self.norm1(h)
+        h = swish(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = swish(h)
+        h = self.conv2(h)
+        if self.in_channels != self.out_channels:
+            x = self.nin_shortcut(x)
+        return x + h
+class Downsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        # no asymmetric padding in torch conv, must do it ourselves
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+    def forward(self, x: Tensor):
+        pad = (0, 1, 0, 1)
+        x = nn.functional.pad(x, pad, mode="constant", value=0)
+        x = self.conv(x)
+        return x
+class Upsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x: Tensor):
+        x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        x = self.conv(x)
+        return x
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        ch: int,
+        ch_mult: list[int],
+        num_res_blocks: int,
+        z_channels: int,
+    ):
+        super().__init__()
+        self.ch = ch
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        # downsampling
+        self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
+        curr_res = resolution
+        in_ch_mult = (1,) + tuple(ch_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        block_in = self.ch
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
+                block_in = block_out
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Downsample(block_in)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        # end
+        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x: Tensor) -> Tensor:
+        # downsampling
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1])
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions - 1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = swish(h)
+        h = self.conv_out(h)
+        return h
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        ch: int,
+        out_ch: int,
+        ch_mult: list[int],
+        num_res_blocks: int,
+        in_channels: int,
+        resolution: int,
+        z_channels: int,
+    ):
+        super().__init__()
+        self.ch = ch
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.ffactor = 2 ** (self.num_resolutions - 1)
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        block_in = ch * ch_mult[self.num_resolutions - 1]
+        curr_res = resolution // 2 ** (self.num_resolutions - 1)
+        self.z_shape = (1, z_channels, curr_res, curr_res)
+        # z to block_in
+        self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
+                block_in = block_out
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+        # end
+        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
+    def forward(self, z: Tensor) -> Tensor:
+        # get dtype for proper tracing
+        upscale_dtype = next(self.up.parameters()).dtype
+        # z to block_in
+        h = self.conv_in(z)
+        # middle
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # cast to proper dtype
+        h = h.to(upscale_dtype)
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](h)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        # end
+        h = self.norm_out(h)
+        h = swish(h)
+        h = self.conv_out(h)
+        return h
+class DiagonalGaussian(nn.Module):
+    def __init__(self, sample: bool = True, chunk_dim: int = 1):
+        super().__init__()
+        self.sample = sample
+        self.chunk_dim = chunk_dim
+    def forward(self, z: Tensor) -> Tensor:
+        mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
+        if self.sample:
+            std = torch.exp(0.5 * logvar)
+            return mean + std * torch.randn_like(mean)
+        else:
+            return mean
+class AutoEncoder(nn.Module):
+    def __init__(self, params: AutoEncoderParams):
+        super().__init__()
+        self.params = params
+        self.encoder = Encoder(
+            resolution=params.resolution,
+            in_channels=params.in_channels,
+            ch=params.ch,
+            ch_mult=params.ch_mult,
+            num_res_blocks=params.num_res_blocks,
+            z_channels=params.z_channels,
+        )
+        self.decoder = Decoder(
+            resolution=params.resolution,
+            in_channels=params.in_channels,
+            ch=params.ch,
+            out_ch=params.out_ch,
+            ch_mult=params.ch_mult,
+            num_res_blocks=params.num_res_blocks,
+            z_channels=params.z_channels,
+        )
+        self.reg = DiagonalGaussian()
+        self.scale_factor = params.scale_factor
+        self.shift_factor = params.shift_factor
+    def encode(self, x: Tensor) -> Tensor:
+        z = self.reg(self.encoder(x))
+        z = self.scale_factor * (z - self.shift_factor)
+        return z
+    def decode(self, z: Tensor) -> Tensor:
+        z = z / self.scale_factor + self.shift_factor
+        return self.decoder(z)
+    def forward(self, x: Tensor) -> Tensor:
+        return self.decode(self.encode(x))

flowdis/conditioner.py ADDED Viewed

	@@ -0,0 +1,44 @@

+from torch import Tensor, nn
+from transformers import (
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTokenizer,
+    T5Config,
+    T5EncoderModel,
+    T5Tokenizer,
+)
+class HFEmbedder(nn.Module):
+    def __init__(self, version: str, max_length: int, is_clip: bool, **hf_kwargs):
+        super().__init__()
+        self.is_clip = is_clip
+        self.max_length = max_length
+        self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
+        if self.is_clip:
+            self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(version, max_length=max_length)
+            config = CLIPTextConfig.from_pretrained(version, **hf_kwargs)
+            self.hf_module: CLIPTextModel = CLIPTextModel._from_config(config)
+        else:
+            self.tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(version, max_length=max_length, legacy=True)
+            config = T5Config.from_pretrained(version, **hf_kwargs)
+            self.hf_module: T5EncoderModel = T5EncoderModel._from_config(config)
+    def forward(self, text: list[str]) -> Tensor:
+        batch_encoding = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=False,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        outputs = self.hf_module(
+            input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
+            attention_mask=None,
+            output_hidden_states=False,
+        )
+        return outputs[self.output_key]

flowdis/configs.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from flowdis.autoencoder import AutoEncoderParams
+from flowdis.model import FluxParams
+# Named hyper-parameter sets: "autoencoder" (AutoEncoderParams) and "flowdis" (FluxParams).
+configs = {
+    "autoencoder": AutoEncoderParams(
+        resolution=256,
+        in_channels=3,
+        ch=128,
+        out_ch=3,
+        ch_mult=[1, 2, 4, 4],  # four levels -> three stride-2 downsamples in the encoder
+        num_res_blocks=2,
+        z_channels=16,
+        scale_factor=0.3611,
+        shift_factor=0.1159,
+    ),
+    "flowdis": FluxParams(
+        in_channels=128,
+        out_channels=64,
+        vec_in_dim=768,
+        context_in_dim=4096,
+        hidden_size=3072,
+        mlp_ratio=4.0,
+        num_heads=24,  # 3072 / 24 = 128 per head
+        depth=19,
+        depth_single_blocks=38,
+        axes_dim=[16, 56, 56],  # sums to 128; presumably must equal the per-head size - TODO confirm in flowdis.model
+        theta=10_000,
+        qkv_bias=True,
+        guidance_embed=False,
+    ),
+}

flowdis/layers.py ADDED Viewed

	@@ -0,0 +1,263 @@

+import math
+from dataclasses import dataclass
+import torch
+from einops import rearrange
+from torch import Tensor, nn
+from flowdis.math import attention, rope
+class EmbedND(nn.Module):
+    def __init__(self, dim: int, theta: int, axes_dim: list[int]):
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        self.axes_dim = axes_dim
+    def forward(self, ids: Tensor) -> Tensor:
+        n_axes = ids.shape[-1]
+        emb = torch.cat(
+            [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
+            dim=-3,
+        )
+        return emb.unsqueeze(1)
+def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
+    """
+    Create sinusoidal timestep embeddings.
+    :param t: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an (N, D) Tensor of positional embeddings.
+    """
+    t = time_factor * t
+    half = dim // 2
+    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(t.device)
+    args = t[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    if torch.is_floating_point(t):
+        embedding = embedding.to(t)
+    return embedding
+class MLPEmbedder(nn.Module):
+    def __init__(self, in_dim: int, hidden_dim: int):
+        super().__init__()
+        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
+        self.silu = nn.SiLU()
+        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
+    def forward(self, x: Tensor) -> Tensor:
+        return self.out_layer(self.silu(self.in_layer(x)))
class RMSNorm(torch.nn.Module):
    """Root-mean-square normalization over the last dimension with a learned scale.

    Args:
        dim: Length of the learned ``scale`` vector (initialized to ones).
    """

    def __init__(self, dim: int):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(dim))
        self.dim = dim

    def forward(self, x: Tensor):
        """Normalize ``x`` in float32, cast back to its dtype, then apply ``scale``."""
        input_dtype = x.dtype
        x32 = x.float()
        mean_square = x32.pow(2).mean(dim=-1, keepdim=True)
        # 1e-6 keeps the reciprocal square root finite for all-zero rows.
        normalized = x32 * torch.rsqrt(mean_square + 1e-6)
        return normalized.to(dtype=input_dtype) * self.scale
class QKNorm(torch.nn.Module):
    """Apply independent ``RMSNorm`` layers to queries and keys.

    Args:
        dim: Size handed to both ``RMSNorm`` layers.
    """

    def __init__(self, dim: int):
        super().__init__()
        self.query_norm = RMSNorm(dim)
        self.key_norm = RMSNorm(dim)

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
        """Return normalized ``q`` and ``k``, both converted with ``.to(v)``.

        ``v`` is only used as the conversion target; it is not normalized.
        """
        normed_q = self.query_norm(q)
        normed_k = self.key_norm(k)
        return normed_q.to(v), normed_k.to(v)
class SelfAttention(nn.Module):
    """Multi-head self-attention with a fused QKV projection and QK normalization.

    Args:
        dim: Width of the input and output features.
        num_heads: Number of heads; the per-head size is ``dim // num_heads``.
        qkv_bias: Whether the fused QKV projection has a bias term.
    """

    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.norm = QKNorm(head_dim)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x: Tensor, pe: Tensor) -> Tensor:
        """Project ``x`` to q/k/v, normalize q and k, attend with ``pe``, project out."""
        q, k, v = rearrange(self.qkv(x), "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        q, k = self.norm(q, k, v)
        attended = attention(q, k, v, pe=pe)
        return self.proj(attended)
@dataclass
class ModulationOut:
    """One (shift, scale, gate) triple produced by ``Modulation``."""

    # Added to the normalized activations after scaling.
    shift: Tensor
    # Applied as ``(1 + scale)`` to the normalized activations.
    scale: Tensor
    # Multiplies the branch output before it is added to the residual.
    gate: Tensor
class Modulation(nn.Module):
    """Map a conditioning vector to one or two ``ModulationOut`` triples.

    Args:
        dim: Width of the conditioning vector and of every produced chunk.
        double: When true, six chunks (two triples) are produced; otherwise
            three chunks (one triple).
    """

    def __init__(self, dim: int, double: bool):
        super().__init__()
        self.is_double = double
        self.multiplier = 6 if double else 3
        self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)

    def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
        """Return ``(first, second)``; ``second`` is ``None`` unless ``double`` was set."""
        projected = self.lin(nn.functional.silu(vec))
        # Insert a singleton dim at position 1, then split the last dim into chunks.
        chunks = projected[:, None, :].chunk(self.multiplier, dim=-1)

        first = ModulationOut(*chunks[:3])
        second = ModulationOut(*chunks[3:]) if self.is_double else None
        return first, second
class DoubleStreamBlock(nn.Module):
    """Transformer block with separate image and text weights and joint attention.

    Each stream has its own modulation, norms, QKV/output projections and MLP.
    The two streams are concatenated (text first) only for the attention call.

    Args:
        hidden_size: Feature width of both streams.
        num_heads: Number of attention heads.
        mlp_ratio: MLP hidden width as a multiple of ``hidden_size``.
        qkv_bias: Whether the QKV projections carry a bias.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float,
        qkv_bias: bool = False,
    ):
        super().__init__()
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        self.num_heads = num_heads
        self.hidden_size = hidden_size

        def make_norm() -> nn.LayerNorm:
            return nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)

        def make_mlp() -> nn.Sequential:
            return nn.Sequential(
                nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
                nn.GELU(approximate="tanh"),
                nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
            )

        # Image stream.
        self.img_mod = Modulation(hidden_size, double=True)
        self.img_norm1 = make_norm()
        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
        self.img_norm2 = make_norm()
        self.img_mlp = make_mlp()

        # Text stream.
        self.txt_mod = Modulation(hidden_size, double=True)
        self.txt_norm1 = make_norm()
        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
        self.txt_norm2 = make_norm()
        self.txt_mlp = make_mlp()

    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]:
        """Run the block; returns the updated ``(img, txt)`` pair."""
        return self._forward(img, txt, vec, pe)

    def _stream_qkv(
        self,
        x: Tensor,
        norm: nn.LayerNorm,
        attn: SelfAttention,
        mod: ModulationOut,
    ) -> tuple[Tensor, Tensor, Tensor]:
        """Normalize and modulate one stream, then return its per-head q, k, v."""
        modulated = (1 + mod.scale) * norm(x) + mod.shift
        q, k, v = rearrange(attn.qkv(modulated), "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        q, k = attn.norm(q, k, v)
        return q, k, v

    def _forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]:
        img_mod1, img_mod2 = self.img_mod(vec)
        txt_mod1, txt_mod2 = self.txt_mod(vec)

        # Prepare each stream for attention.
        img_q, img_k, img_v = self._stream_qkv(img, self.img_norm1, self.img_attn, img_mod1)
        txt_q, txt_k, txt_v = self._stream_qkv(txt, self.txt_norm1, self.txt_attn, txt_mod1)

        # Joint attention over the concatenated sequence, text tokens first.
        joint = attention(
            torch.cat((txt_q, img_q), dim=2),
            torch.cat((txt_k, img_k), dim=2),
            torch.cat((txt_v, img_v), dim=2),
            pe=pe,
        )
        n_txt = txt.shape[1]
        txt_attn, img_attn = joint[:, :n_txt], joint[:, n_txt:]

        # Image blocks: gated attention residual, then gated MLP residual.
        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
        img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)

        # Text blocks: same structure with the text-side weights.
        txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
        txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
        return img, txt
class SingleStreamBlock(nn.Module):
    """
    A DiT block with parallel linear layers as described in
    https://arxiv.org/abs/2302.05442 and adapted modulation interface.

    ``linear1`` produces the attention QKV and the MLP input in one projection;
    ``linear2`` consumes the concatenated attention output and activated MLP
    branch in one projection.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qk_scale: float | None = None,
    ):
        super().__init__()
        self.hidden_dim = hidden_size
        self.num_heads = num_heads
        head_dim = hidden_size // num_heads
        # Stored but not read anywhere inside this class.
        self.scale = qk_scale or head_dim**-0.5

        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
        # Fused input projection: qkv and mlp_in.
        self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
        # Fused output projection: proj and mlp_out.
        self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)

        self.norm = QKNorm(head_dim)

        self.hidden_size = hidden_size
        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)

        self.mlp_act = nn.GELU(approximate="tanh")
        self.modulation = Modulation(hidden_size, double=False)

    def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
        """Run the block and return ``x`` plus the gated block output."""
        return self._forward(x, vec, pe)

    def _forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
        mod, _ = self.modulation(vec)
        modulated = (1 + mod.scale) * self.pre_norm(x) + mod.shift

        # Split the fused projection into the attention and MLP branches.
        fused = self.linear1(modulated)
        qkv, mlp = torch.split(fused, [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)

        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        q, k = self.norm(q, k, v)
        attn = attention(q, k, v, pe=pe)

        # Activate the MLP branch, rejoin with attention, run the second linear layer.
        merged = torch.cat((attn, self.mlp_act(mlp)), 2)
        return x + mod.gate * self.linear2(merged)
class LastLayer(nn.Module):
    """Final adaptive-LayerNorm modulation followed by a linear output projection.

    Args:
        hidden_size: Width of the incoming features and of ``vec``.
        patch_size: The output width is ``patch_size**2 * out_channels``.
        out_channels: See ``patch_size``.
    """

    def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))

    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
        """Modulate the normalized ``x`` with shift/scale derived from ``vec``, then project."""
        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
        # Insert a dim at position 1 so shift/scale broadcast over x's dim 1.
        modulated = (1 + scale.unsqueeze(1)) * self.norm_final(x) + shift.unsqueeze(1)
        return self.linear(modulated)

flowdis/loaders.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import torch
+from safetensors.torch import load_file
+from flowdis.autoencoder import AutoEncoder
+from flowdis.conditioner import HFEmbedder
+from flowdis.configs import configs
+from flowdis.model import Flux, FluxParams
def load_transformer(
    model_name: str,
    model_path: str,
    device: str | torch.device = "cuda",
    config: FluxParams | None = None,
    state_dict: dict | None = None,
) -> Flux:
    """Build a ``Flux`` transformer and load its weights.

    The model is created on the meta device and materialized on CPU with
    ``to_empty``, so every parameter holds uninitialized memory until the
    state dict overwrites it.

    Args:
        model_name: Key into ``configs``; used only when ``config`` is falsy.
        model_path: Checkpoint path. Paths ending in ``.safetensors`` are read
            with ``load_file``, anything else with ``torch.load``. Ignored
            when ``state_dict`` is given.
        device: Device the loaded model is moved to.
        config: Optional explicit parameters overriding ``configs[model_name]``.
        state_dict: Optional pre-loaded weights.

    Returns:
        The model in eval mode, in bfloat16, on ``device``.
    """
    import logging

    with torch.device("meta"):
        model = Flux(config if config else configs[model_name]).to(dtype=torch.bfloat16)
    model.to_empty(device="cpu")

    if state_dict is None:
        if str(model_path).endswith(".safetensors"):
            state_dict = load_file(model_path, device="cpu")
        else:
            # Restrict unpickling to tensors and plain containers; non-safetensors
            # checkpoints are pickle files and must not execute arbitrary code.
            state_dict = torch.load(model_path, map_location="cpu", weights_only=True)

    # strict=False tolerates key mismatches, but after to_empty() a missing key
    # means that parameter keeps uninitialized memory, so make it visible.
    load_result = model.load_state_dict(state_dict, assign=True, strict=False)
    if load_result.missing_keys:
        logging.getLogger(__name__).warning(
            "Transformer checkpoint is missing %d key(s); these weights remain uninitialized: %s",
            len(load_result.missing_keys),
            load_result.missing_keys,
        )

    model = model.to(device=device, dtype=torch.bfloat16)
    return model.eval()
def load_autoencoder(
    model_path: str,
    device: str | torch.device = "cuda"
) -> AutoEncoder:
    """Build the ``AutoEncoder`` from ``configs["autoencoder"]`` and load its weights.

    The module is created on the meta device and materialized on CPU with
    ``to_empty``, so parameters hold uninitialized memory until the
    safetensors state dict overwrites them.

    Args:
        model_path: Path to the safetensors checkpoint.
        device: Device the loaded autoencoder is moved to.

    Returns:
        The autoencoder in eval mode, in bfloat16, on ``device``.
    """
    import logging

    with torch.device("meta"):
        ae = AutoEncoder(configs["autoencoder"])
    ae.to_empty(device="cpu")

    state_dict = load_file(model_path, device="cpu")
    # strict=False tolerates key mismatches, but after to_empty() a missing key
    # means that parameter keeps uninitialized memory, so make it visible.
    load_result = ae.load_state_dict(state_dict, assign=True, strict=False)
    if load_result.missing_keys:
        logging.getLogger(__name__).warning(
            "Autoencoder checkpoint is missing %d key(s); these weights remain uninitialized: %s",
            len(load_result.missing_keys),
            load_result.missing_keys,
        )

    ae = ae.to(device=device, dtype=torch.bfloat16)
    return ae.eval()
def load_t5(
    model_path: str,
    max_length: int = 512,
    device: str | torch.device = "cuda"
) -> HFEmbedder:
    """Build the T5 ``HFEmbedder`` and load its weights from safetensors.

    Args:
        model_path: Path to the ``model.safetensors`` file; a ``str`` or a
            path-like object. Its parent directory is handed to ``HFEmbedder``.
        max_length: Forwarded to ``HFEmbedder``.
        device: Device the loaded embedder is moved to.

    Returns:
        The embedder in bfloat16 on ``device``.
    """
    import logging
    from pathlib import Path

    # Accept plain strings as the annotation promises; `.parent` needs a Path.
    weights_path = Path(model_path)

    with torch.device("meta"):
        t5 = HFEmbedder(
            weights_path.parent,
            max_length=max_length,
            is_clip=False,
            dtype=torch.bfloat16
        )
    t5.to_empty(device="cpu")

    state_dict = load_file(model_path, device="cpu")
    # After to_empty() any tensor absent from the checkpoint keeps uninitialized
    # memory; strict=False would otherwise hide that.
    load_result = t5.load_state_dict(state_dict, assign=True, strict=False)
    if load_result.missing_keys:
        logging.getLogger(__name__).warning(
            "T5 checkpoint is missing %d key(s); these weights remain uninitialized: %s",
            len(load_result.missing_keys),
            load_result.missing_keys,
        )
    return t5.to(device=device, dtype=torch.bfloat16)
def load_clip(
    model_path: str,
    device: str | torch.device = "cuda"
) -> HFEmbedder:
    """Build the CLIP ``HFEmbedder`` and load its weights from safetensors.

    Args:
        model_path: Path to the ``model.safetensors`` file; a ``str`` or a
            path-like object. Its parent directory is handed to ``HFEmbedder``.
        device: Device the loaded embedder is moved to.

    Returns:
        The embedder in bfloat16 on ``device``.
    """
    from pathlib import Path

    # Accept plain strings as the annotation promises; `.parent` needs a Path.
    weights_path = Path(model_path)

    clip = HFEmbedder(
        weights_path.parent,
        max_length=77,
        is_clip=True,
        dtype=torch.bfloat16
    )
    state_dict = load_file(model_path, device="cpu")
    clip.load_state_dict(state_dict, assign=True, strict=False)
    return clip.to(device=device, dtype=torch.bfloat16)

flowdis/math.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import torch
+from einops import rearrange
+from torch import Tensor
def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
    """Rotate q/k with ``pe``, run scaled dot-product attention, and merge heads.

    The result is rearranged from ``B H L D`` to ``B L (H D)``.
    """
    rotated_q, rotated_k = apply_rope(q, k, pe)
    per_head = torch.nn.functional.scaled_dot_product_attention(rotated_q, rotated_k, v)
    return rearrange(per_head, "B H L D -> B L (H D)")
def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    """Return 2x2 rotation matrices for every position/frequency pair.

    For angle ``a = pos * theta**(-2i/dim)`` the matrix is
    ``[[cos a, -sin a], [sin a, cos a]]``. ``dim`` must be even. The result
    is float32 with two trailing dimensions of size 2.
    """
    assert dim % 2 == 0
    exponents = torch.arange(0, dim, 2, dtype=pos.dtype, device=pos.device) / dim
    omega = 1.0 / (theta**exponents)

    # Outer product of positions and frequencies.
    angles = torch.einsum("...n,d->...nd", pos, omega)
    cos, sin = torch.cos(angles), torch.sin(angles)

    flat_rotation = torch.stack([cos, -sin, sin, cos], dim=-1)
    rotation = rearrange(flat_rotation, "b n d (i j) -> b n d i j", i=2, j=2)
    return rotation.float()
def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
    """Apply the 2x2 rotations in ``freqs_cis`` to consecutive pairs of ``xq`` and ``xk``.

    The last dimension of each input is viewed as pairs, computed in float32,
    and the results are returned in the original shape and dtype.
    """

    def rotate(x: Tensor) -> Tensor:
        pairs = x.float().reshape(*x.shape[:-1], -1, 1, 2)
        # Matrix-vector product written out per matrix column.
        rotated = freqs_cis[..., 0] * pairs[..., 0] + freqs_cis[..., 1] * pairs[..., 1]
        return rotated.reshape(*x.shape).type_as(x)

    return rotate(xq), rotate(xk)

flowdis/model.py ADDED Viewed

	@@ -0,0 +1,118 @@

+from dataclasses import dataclass
+import torch
+from torch import Tensor, nn
+from flowdis.layers import (
+    DoubleStreamBlock,
+    EmbedND,
+    LastLayer,
+    MLPEmbedder,
+    SingleStreamBlock,
+    timestep_embedding,
+)
@dataclass
class FluxParams:
    """Hyperparameters consumed by ``Flux.__init__``."""

    # Input width of the image-token projection (``img_in``).
    in_channels: int
    # Channel count handed to the final layer.
    out_channels: int
    # Input width of the ``vector_in`` embedder.
    vec_in_dim: int
    # Input width of the text projection (``txt_in``).
    context_in_dim: int
    # Model width; must be divisible by ``num_heads``.
    hidden_size: int
    # MLP hidden width as a multiple of ``hidden_size``.
    mlp_ratio: float
    # Number of attention heads.
    num_heads: int
    # Number of double-stream blocks.
    depth: int
    # Number of single-stream blocks.
    depth_single_blocks: int
    # Per-axis positional sizes; must sum to ``hidden_size // num_heads``.
    axes_dim: list[int]
    # Base handed to the positional embedder.
    theta: int
    # Whether the double-stream QKV projections carry a bias.
    qkv_bias: bool
    # Whether a guidance embedder is built and required in ``forward``.
    guidance_embed: bool
class Flux(nn.Module):
    """
    Transformer model for flow matching on sequences.

    Image and text tokens first pass through ``depth`` double-stream blocks,
    are then concatenated (text first) for ``depth_single_blocks``
    single-stream blocks, and the image part is projected by the final layer.
    """

    def __init__(self, params: FluxParams):
        super().__init__()

        self.params = params
        self.in_channels = params.in_channels
        self.out_channels = params.out_channels

        pe_dim, remainder = divmod(params.hidden_size, params.num_heads)
        if remainder != 0:
            raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}")
        if sum(params.axes_dim) != pe_dim:
            raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")

        self.hidden_size = params.hidden_size
        self.num_heads = params.num_heads

        # Input embedders.
        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
        self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
        if params.guidance_embed:
            self.guidance_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
        else:
            self.guidance_in = nn.Identity()
        self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)

        # Transformer trunk.
        double_blocks = []
        for _ in range(params.depth):
            double_blocks.append(
                DoubleStreamBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=params.mlp_ratio,
                    qkv_bias=params.qkv_bias,
                )
            )
        self.double_blocks = nn.ModuleList(double_blocks)

        single_blocks = []
        for _ in range(params.depth_single_blocks):
            single_blocks.append(
                SingleStreamBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=params.mlp_ratio,
                )
            )
        self.single_blocks = nn.ModuleList(single_blocks)

        self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)

    def forward(
        self,
        img: Tensor,
        img_ids: Tensor,
        txt: Tensor,
        txt_ids: Tensor,
        timesteps: Tensor,
        y: Tensor,
        guidance: Tensor | None = None,
    ) -> Tensor:
        """Run the transformer and return the final-layer output for the image tokens.

        Raises:
            ValueError: If ``img`` or ``txt`` is not 3-dimensional, or if the
                model was built with ``guidance_embed`` and ``guidance`` is None.
        """
        if not (img.ndim == 3 and txt.ndim == 3):
            raise ValueError("Input img and txt tensors must have 3 dimensions.")

        # Embed the image tokens and build the conditioning vector.
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256))
        if self.params.guidance_embed:
            if guidance is None:
                raise ValueError("Didn't get guidance strength for guidance distilled model.")
            vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
        vec = vec + self.vector_in(y)
        txt = self.txt_in(txt)

        # Positional embedding over the joint sequence, text ids first.
        pe = self.pe_embedder(torch.cat((txt_ids, img_ids), dim=1))

        for block in self.double_blocks:
            img, txt = block(img=img, txt=txt, vec=vec, pe=pe)

        n_txt = txt.shape[1]
        tokens = torch.cat((txt, img), 1)
        for block in self.single_blocks:
            tokens = block(tokens, vec=vec, pe=pe)

        # Drop the text tokens again before the output projection.
        img_tokens = tokens[:, n_txt:, ...]
        return self.final_layer(img_tokens, vec)  # (N, T, patch_size ** 2 * out_channels)

flowdis/sampling.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import math
+import torch
+import torchvision.transforms.functional as tvF
+from einops import rearrange, repeat
+from PIL import Image
+from scipy import stats
+from torch import Tensor
+from flowdis.model import Flux
+from flowdis.util import Models
def unpack(x: Tensor, height: int, width: int) -> Tensor:
    """Rearrange a packed token sequence back into a ``b c H W`` map.

    Tokens are laid out on a ``ceil(height / 16)`` by ``ceil(width / 16)``
    grid, and each token is unfolded into a 2x2 patch.
    """
    grid_rows = math.ceil(height / 16)
    grid_cols = math.ceil(width / 16)
    return rearrange(
        x,
        "b (h w) (c ph pw) -> b c (h ph) (w pw)",
        h=grid_rows,
        w=grid_cols,
        ph=2,
        pw=2,
    )
def beta_scheduler(num_timesteps: int, alpha: float = 2.5, beta: float = 1.0) -> list[float]:
    """Return a decreasing time schedule from Beta(alpha, beta) quantiles.

    ``num_timesteps + 1`` probabilities, evenly spaced from 1 down to 0, are
    mapped through the Beta inverse CDF. A trailing ``0.0`` is appended if the
    schedule does not already end at or below zero.
    """
    quantiles = torch.linspace(1, 0, num_timesteps + 1)
    schedule = stats.beta.ppf(quantiles, alpha, beta).tolist()
    if schedule[-1] > 0.0:
        schedule.append(0.0)
    return schedule
def prepare(
    img: Tensor,
    prompt: str | list[str],
    models: Models,
    device: str = "cuda"
) -> dict[str, Tensor]:
    """Encode the image and prompt(s) into the inputs used for sampling.

    Returns a dict with keys ``img``, ``img_ids``, ``txt``, ``txt_ids`` and
    ``vec``, all placed on the device of the encoded image.
    """
    batch, _, _, _ = img.shape
    single_prompt = isinstance(prompt, str)
    # A single image paired with a list of prompts takes its batch size from the list.
    if batch == 1 and not single_prompt:
        batch = len(prompt)
    if single_prompt:
        prompt = [prompt]

    def expand_to_batch(t: Tensor) -> Tensor:
        """Repeat a batch-of-one tensor ``batch`` times; leave others unchanged."""
        if t.shape[0] == 1 and batch > 1:
            return repeat(t, "1 ... -> bs ...", bs=batch)
        return t

    # Encode the conditioning image with the autoencoder.
    with torch.no_grad():
        img = models.ae.encode(img.to(device=device, dtype=torch.bfloat16))

    # One id triple per 2x2 latent patch: channel 1 = row index, channel 2 = column index.
    rows, cols = img.shape[2] // 2, img.shape[3] // 2
    img_ids = torch.zeros(rows, cols, 3)
    img_ids[..., 1] += torch.arange(rows)[:, None]
    img_ids[..., 2] += torch.arange(cols)[None, :]
    img_ids = repeat(img_ids, "h w c -> b (h w) c", b=batch)

    # Pack 2x2 latent patches into tokens.
    img = expand_to_batch(rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2))

    txt = expand_to_batch(models.t5(prompt))
    txt_ids = torch.zeros(batch, txt.shape[1], 3)

    vec = expand_to_batch(models.clip(prompt))

    target = img.device
    return {
        "img": img,
        "img_ids": img_ids.to(target),
        "txt": txt.to(target),
        "txt_ids": txt_ids.to(target),
        "vec": vec.to(target),
    }
def solve_flowdis_ode(
    model: Flux,
    img: Tensor,
    img_ids: Tensor,
    txt: Tensor,
    txt_ids: Tensor,
    vec: Tensor,
    num_inference_steps: int,
):
    """Integrate the model prediction with explicit Euler steps along ``beta_scheduler``.

    The state starts at ``img``. At every step the model receives the current
    state concatenated with ``img`` along the last dimension, and the state is
    advanced by ``(t_next - t_now) * prediction``. Returns the final state.
    """
    state = img
    schedule = beta_scheduler(num_inference_steps)
    for t_now, t_next in zip(schedule, schedule[1:]):
        t_batch = torch.full((state.shape[0],), t_now, dtype=state.dtype, device=state.device)
        model_out = model(
            img=torch.cat((state, img), dim=-1),
            img_ids=img_ids,
            txt=txt,
            txt_ids=txt_ids,
            y=vec,
            timesteps=t_batch,
        )
        state = state + (t_next - t_now) * model_out
    return state
@torch.no_grad()
def flowdis_predict(
    image: Image.Image,
    prompt: str | list[str],
    models: Models,
    resolution: int = 1024,
    num_inference_steps: int = 2,
    device: str = "cuda",
) -> Image.Image:
    """Predict a segmentation mask for ``image`` guided by ``prompt``.

    Args:
        image: Input PIL image in any mode; it is converted to RGB first.
        prompt: Text prompt (or list of prompts) passed to ``prepare``.
        models: Loaded model bundle.
        resolution: Side length the image is resized to before encoding.
        num_inference_steps: Number of steps passed to ``solve_flowdis_ode``.
        device: Device string (e.g. ``"cuda"`` or ``"cuda:0"``) or torch device.

    Returns:
        A mode-``"L"`` PIL image resized to the size of the input image.
    """
    image_rgb = image.convert("RGB")
    # Resize the RGB-converted image, not the raw input, so RGBA / grayscale /
    # palette inputs still produce a 3-channel tensor.
    resized = image_rgb.resize((resolution, resolution))
    image_t = tvF.to_tensor(resized).unsqueeze(0).to(device=device)
    image_t = (image_t - 0.5) / 0.5  # [0, 1] -> [-1, 1]

    inp = prepare(image_t, prompt, models, device)
    pred_mask_latent_t = solve_flowdis_ode(
        models.transformer,
        **inp,
        num_inference_steps=num_inference_steps,
    )
    pred_mask_latent_t = unpack(pred_mask_latent_t.float(), resolution, resolution)

    # autocast wants the bare device type ("cuda"), not an indexed device ("cuda:0").
    autocast_device = torch.device(device).type
    with torch.autocast(device_type=autocast_device, dtype=torch.bfloat16):
        pred_mask_t = models.ae.decode(pred_mask_latent_t).clamp(-1, 1)

    # Take the first batch item, map [-1, 1] -> [0, 255], average over channels.
    pred_mask_t = rearrange(pred_mask_t[0], "c h w -> h w c")
    pred_mask_np = (127.5 * (pred_mask_t + 1.0)).mean(dim=-1).cpu().byte().numpy()
    pred_mask = Image.fromarray(pred_mask_np).convert("L")
    return pred_mask.resize(image_rgb.size)

flowdis/util.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import logging
+from copy import deepcopy
+from dataclasses import dataclass
+from pathlib import Path
+import torch
+import numpy as np
+from huggingface_hub import snapshot_download
+from safetensors.torch import load_file
+from flowdis.autoencoder import AutoEncoder
+from flowdis.conditioner import HFEmbedder
+from flowdis.configs import configs
+from flowdis.loaders import load_autoencoder, load_clip, load_t5, load_transformer
+from flowdis.model import Flux
+logger = logging.getLogger(__name__)
@dataclass
class Models:
    """Bundle of the four loaded networks returned by ``load_models``."""

    # CLIP text embedder.
    clip: HFEmbedder
    # T5 text embedder.
    t5: HFEmbedder
    # Autoencoder.
    ae: AutoEncoder
    # Flux transformer.
    transformer: Flux
def load_models(
    root_model_dir: Path | str | None = None,
    device: str | torch.device = "cuda"
) -> Models:
    """
    Load the models for the FlowDIS pipeline.

    Args:
        root_model_dir: The root model directory (``Path`` or ``str``).
            If None, the models are downloaded from the Hugging Face Hub.
        device: The device to load the models on.

    Returns:
        Models: The loaded models.
    """
    if root_model_dir is None:
        root_model_dir = download_from_hf_hub("PAIR/FlowDIS")
    else:
        # Accept plain strings too; the `/` joins below require a Path.
        root_model_dir = Path(root_model_dir)

    logger.info("Loading T5.")
    t5 = load_t5(
        model_path=root_model_dir / "t5-v1_1-xxl" / "model.safetensors",
        device=device,
        max_length=512
    )

    logger.info("Loading CLIP.")
    clip = load_clip(
        model_path=root_model_dir / "clip-vit-large-patch14" / "model.safetensors",
        device=device
    )

    logger.info("Loading AE.")
    ae = load_autoencoder(
        model_path=root_model_dir / "ae.safetensors",
        device=device
    )

    logger.info("Loading Transformer.")
    model = load_transformer(
        model_name="flowdis",
        model_path=root_model_dir / "flowdis-transformer.safetensors",
        device=device,
    )

    logger.info("All models loaded.")
    return Models(
        clip=clip,
        t5=t5,
        ae=ae,
        transformer=model,
    )
def download_from_hf_hub(
    repo_id: str,
    cache_dir: str | Path | None = None,
    revision: str | None = None,
) -> Path:
    """
    Fetch a snapshot of a FlowDIS model repository from the Hugging Face Hub.

    Args:
        repo_id: Hub repo id, e.g. "PAIR/FlowDIS".
        cache_dir: Optional cache directory, forwarded to ``snapshot_download``.
        revision: Optional git revision (branch, tag, or commit SHA).

    Returns:
        Local directory of the downloaded snapshot as a ``Path``; it can be
        passed to `load_models` as `root_model_dir`.
    """
    logger.info(f"Downloading {repo_id} from Hugging Face Hub.")
    download_kwargs = {
        "repo_id": repo_id,
        "cache_dir": cache_dir,
        "revision": revision,
    }
    snapshot_dir = snapshot_download(**download_kwargs)
    logger.info(f"Snapshot available at {snapshot_dir}.")
    return Path(snapshot_dir)
def green_screen(img: np.ndarray, mask: np.ndarray) -> np.ndarray:
    """Composite ``img`` over a pure green background using ``mask`` as alpha.

    ``mask`` is divided by 255 and repeated over three channels; the blend is
    cast to ``uint8`` before being returned.
    """
    pixels = np.array(img)
    alpha = (np.array(mask) / 255)[:, :, np.newaxis].repeat(3, axis=2)
    green = np.array([0, 255, 0], dtype=np.uint8)
    blended = pixels * alpha + (1 - alpha) * green
    return blended.astype(np.uint8)

pyproject.toml ADDED Viewed

	@@ -0,0 +1,50 @@

+[build-system]
+requires = ["setuptools>=61", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "flowdis"
+version = "0.1.0"
+description = "FlowDIS: Language-Guided Dichotomous Image Segmentation with Flow Matching"
+readme = "README.md"
+requires-python = ">=3.10"
+license = { text = "MIT" }
+authors = [
+    { name = "Andranik Sargsyan" },
+    { name = "Shant Navasardyan" },
+]
+keywords = ["segmentation", "flow-matching", "background removal", "deep-learning"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Image Recognition",
+]
+dependencies = [
+    "accelerate>=1.12.0,<2.0",
+    "einops>=0.8.2,<1.0",
+    "gradio==6.3.0",
+    "numpy>=1.24.0,<2.0",
+    "opencv-python>=4.11.0,<5.0",
+    "Pillow>=10.0.0,<11.0",
+    "safetensors>=0.7.0,<1.0",
+    "scipy>=1.17.1,<2.0",
+    "sentencepiece>=0.2.1,<1.0",
+    "tiktoken>=0.12.0,<1.0",
+    "torch>=2.8.0,<=2.10",
+    "torchvision>=0.25.0",
+    "transformers>=4.39.0,<5.0",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0",
+    "ruff>=0.1.0",
+]
+[tool.setuptools]
+packages = ["flowdis"]

qwen.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import logging
+import torch
+from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
+from PIL import Image
+logger = logging.getLogger(__name__)
# Qwen3-VL is loaded at import time only when CUDA is available;
# otherwise both handles stay None.
model = None
processor = None
if not torch.cuda.is_available():
    logger.info("Qwen3VL was not loaded because no GPU is available.")
else:
    logger.info("Loading Qwen3VL model.")
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen3-VL-4B-Instruct",
        dtype=torch.bfloat16,
        device_map="auto"
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-4B-Instruct")
    logger.info("Qwen3VL model loaded.")
def expand_prompt(image: Image.Image, user_prompt: str) -> str:
    """
    Expand the user prompt using the Qwen3VL model.

    Args:
        image: The image to use for the prompt expansion.
        user_prompt: The user prompt to expand.

    Returns:
        The expanded prompt.

    Raises:
        RuntimeError: If the Qwen3VL model or processor was not loaded
            (the module only loads them when CUDA is available).
    """
    # Fail with a clear message instead of an AttributeError on None.
    if model is None or processor is None:
        raise RuntimeError(
            "Qwen3VL model is not loaded (no GPU was available at import time); "
            "cannot expand the prompt."
        )

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": f"Describe the {user_prompt} in this image with a short prompt. Don't use surrounding objects in the description. Also don't describe the background, like what it is sitting on or what it is on top of, etc..."}
            ]
        }
    ]
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = processor(
        text=[text],
        images=[image],
        padding=True,
        return_tensors="pt"
    )
    inputs = inputs.to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512
        )

    # Keep only the newly generated tokens, dropping the echoed prompt tokens.
    generated_ids_trimmed = generated_ids[:, inputs["input_ids"].shape[1]:]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True
    )[0]
    return output_text

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+accelerate>=1.12.0,<2.0
+einops>=0.8.2,<1.0
+gradio==6.3.0
+numpy>=1.24.0,<2.0
+opencv-python>=4.11.0,<5.0
+Pillow>=10.0.0,<11.0
+safetensors>=0.7.0,<1.0
+scipy>=1.17.1,<2.0
+sentencepiece>=0.2.1,<1.0
+tiktoken>=0.12.0,<1.0
+torch>=2.8.0,<=2.10
+torchvision>=0.25.0
+transformers>=4.39.0,<5.0