Alexander Bagus committed on
Commit
371c0ec
·
1 Parent(s): bb2d84c
Files changed (2) hide show
  1. app.py +51 -50
  2. examples/0_examples.json +7 -6
app.py CHANGED
@@ -1,65 +1,47 @@
1
  import gradio as gr
2
  import numpy as np
3
  import torch, random, json, spaces, time
4
- # from safetensors.torch import load_file
5
- # from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler
6
- # from videox_fun.pipeline import ZImageControlPipeline
7
- # from videox_fun.models import ZImageControlTransformer2DModel
8
- # from transformers import AutoTokenizer, Qwen3ForCausalLM
9
- # from diffusers import AutoencoderKL
10
- # from controlnet_aux.processor import Processor
11
- from utils import repo_utils, image_utils, prompt_utils
12
 
13
 
14
- repo_utils.clone_repo_if_not_exists("https://github.com/apple/ml-starflow.git", "app/models")
15
- repo_utils.clone_repo_if_not_exists("https://huggingface.co/apple/starflow", "app/models")
16
-
17
- # MODEL_PATH = "models/Z-Image-Turbo/"
18
- # CONTROLNET_PATH = "models/Z-Image-Turbo-Fun-Controlnet-Union/Z-Image-Turbo-Fun-Controlnet-Union.safetensors"
19
 
20
  DTYPE = torch.bfloat16
21
  MAX_SEED = np.iinfo(np.int32).max
22
 
23
- # # load transformer
24
- # transformer = ZImageControlTransformer2DModel.from_pretrained(
25
- # MODEL_PATH,
26
- # subfolder="transformer",
27
- # transformer_additional_kwargs={
28
- # "control_layers_places": [0, 5, 10, 15, 20, 25],
29
- # "control_in_dim": 16
30
- # },
31
- # torch_dtype= DTYPE
32
- # ).to("cuda")
33
-
34
- # ## Load controlnet
35
- # state_dict = load_file(CONTROLNET_PATH)
36
- # state_dict = state_dict["state_dict"] if "state_dict" in state_dict else state_dict
37
- # m, u = transformer.load_state_dict(state_dict, strict=False)
38
- # print(f"missing keys: {len(m)}, unexpected keys: {len(u)}")
39
-
40
- # # load ZImageControlPipeline
41
- # vae = AutoencoderKL.from_pretrained(
42
- # MODEL_PATH,
43
- # subfolder="vae",
44
- # device_map="cuda",
45
- # torch_dtype= DTYPE
46
- # )
47
 
48
- # tokenizer = AutoTokenizer.from_pretrained(
49
- # MODEL_PATH,
50
- # subfolder="tokenizer"
51
- # )
 
 
 
 
 
 
 
 
52
 
53
- # text_encoder = Qwen3ForCausalLM.from_pretrained(
54
- # MODEL_PATH,
55
- # subfolder="text_encoder",
56
- # torch_dtype=DTYPE,
57
- # )
58
 
59
- # scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
60
- # MODEL_PATH,
61
- # subfolder="scheduler"
62
- # )
63
 
64
  # pipe = ZImageControlPipeline(
65
  # vae=vae,
@@ -90,6 +72,25 @@ def inference(
90
  timestamp = time.time()
91
  print(f"timestamp: {timestamp}")
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  # # process image
94
  # print("DEBUG: process image")
95
  # if input_image is None:
 
1
  import gradio as gr
2
  import numpy as np
3
  import torch, random, json, spaces, time
4
+ from diffsynth.pipelines.qwen_image import (
5
+ QwenImagePipeline, ModelConfig,
6
+ QwenImageUnit_Image2LoRAEncode, QwenImageUnit_Image2LoRADecode
7
+ )
8
+ from safetensors.torch import save_file
9
+ import torch
10
+ from PIL import Image
11
+ # from utils import repo_utils, image_utils, prompt_utils
12
 
13
 
14
+ # repo_utils.clone_repo_if_not_exists("https://github.com/apple/ml-starflow.git", "app/models")
15
+ # repo_utils.clone_repo_if_not_exists("https://huggingface.co/apple/starflow", "app/models")
 
 
 
16
 
17
  DTYPE = torch.bfloat16
18
  MAX_SEED = np.iinfo(np.int32).max
19
 
20
+ vram_config_disk_offload = {
21
+ "offload_dtype": "disk",
22
+ "offload_device": "disk",
23
+ "onload_dtype": "disk",
24
+ "onload_device": "disk",
25
+ "preparing_dtype": torch.bfloat16,
26
+ "preparing_device": "cuda",
27
+ "computation_dtype": torch.bfloat16,
28
+ "computation_device": "cuda",
29
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ # Load models
32
+ pipe = QwenImagePipeline.from_pretrained(
33
+ torch_dtype=torch.bfloat16,
34
+ device="cuda",
35
+ model_configs=[
36
+ ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="SigLIP2-G384/model.safetensors", **vram_config_disk_offload),
37
+ ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="DINOv3-7B/model.safetensors", **vram_config_disk_offload),
38
+ ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-i2L", origin_file_pattern="Qwen-Image-i2L-Style.safetensors", **vram_config_disk_offload),
39
+ ],
40
+ processor_config=ModelConfig(model_id="Qwen/Qwen-Image-Edit", origin_file_pattern="processor/"),
41
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
42
+ )
43
 
 
 
 
 
 
44
 
 
 
 
 
45
 
46
  # pipe = ZImageControlPipeline(
47
  # vae=vae,
 
72
  timestamp = time.time()
73
  print(f"timestamp: {timestamp}")
74
 
75
+ # Load images
76
+ images = [
77
+ Image.open("examples/style/1/0.jpg"),
78
+ Image.open("examples/style/1/1.jpg"),
79
+ Image.open("examples/style/1/2.jpg"),
80
+ Image.open("examples/style/1/3.jpg"),
81
+ Image.open("examples/style/1/4.jpg"),
82
+ ]
83
+
84
+
85
+ # Model inference
86
+ with torch.no_grad():
87
+ embs = QwenImageUnit_Image2LoRAEncode().process(pipe, image2lora_images=images)
88
+ lora = QwenImageUnit_Image2LoRADecode().process(pipe, **embs)["lora"]
89
+
90
+ save_file(lora, "model_style.safetensors")
91
+
92
+ return True
93
+
94
  # # process image
95
  # print("DEBUG: process image")
96
  # if input_image is None:
examples/0_examples.json CHANGED
@@ -1,7 +1,8 @@
1
- [
2
- ["examples/pose2.jpg", "Woman wearing jeans and tanktop, from dubai", "HED"],
3
- ["examples/bottle.jpg", "A man holding a bottle", "HED"],
4
- ["examples/room.jpg", "modern architecture, living room", "Depth"],
5
- ["examples/pose1.jpg", "A female paladin. Mountain background.", "Pose"],
6
- ["examples/bird.jpg", "A bird sitting on a branch, cartoon.", "Canny"]
 
7
  ]
 
1
+ [
2
+ "Saitama, punching",
3
+ "Levi Ackerman, kick",
4
+ "Goku, powering up",
5
+ "Light Yagami, writing",
6
+ "Naruto, eating ramen",
7
+ "Luffy, grinning"
8
  ]