Spaces:
Runtime error
Runtime error
lzyhha
committed on
Commit
·
b4faa43
1
Parent(s):
391f6e1
test
Browse files- app.py +22 -16
- demo_tasks/gradio_tasks_unseen.py +1 -1
- visualcloze.py +19 -17
app.py
CHANGED
|
@@ -17,6 +17,10 @@ default_steps = 30
|
|
| 17 |
|
| 18 |
GUIDANCE = """
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
## 📋 Quick Start Guide:
|
| 21 |
1. Adjust **Number of In-context Examples**, 0 disables in-context learning.
|
| 22 |
2. Set **Task Columns**, the number of images involved in a task.
|
|
@@ -24,16 +28,18 @@ GUIDANCE = """
|
|
| 24 |
4. Click **Generate** to create the images.
|
| 25 |
5. Parameters can be fine-tuned under **Advanced Options**.
|
| 26 |
|
| 27 |
-
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
<div style='font-size: 20px; '> 📧 Need help or have questions? Contact us at: lizhongyu [AT] mail.nankai.edu.cn</div>
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
When generating an image with the resolution of 1024,
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
| 37 |
|
| 38 |
"""
|
| 39 |
|
|
@@ -90,9 +96,7 @@ def create_demo(model):
|
|
| 90 |
for i in range(max_grid_h):
|
| 91 |
# Add row label before each row
|
| 92 |
row_texts.append(gr.Markdown(
|
| 93 |
-
"
|
| 94 |
-
("query" if i == default_grid_h - 1 else f"In-context Example {i + 1}") +
|
| 95 |
-
"</div>",
|
| 96 |
elem_id=f"row_text_{i}",
|
| 97 |
visible=i < default_grid_h
|
| 98 |
))
|
|
@@ -297,9 +301,7 @@ def create_demo(model):
|
|
| 297 |
gr.update(
|
| 298 |
elem_id=f"row_text_{i}",
|
| 299 |
visible=i < actual_h,
|
| 300 |
-
value="
|
| 301 |
-
("Query" if i == actual_h - 1 else f"In-context Example {i + 1}") +
|
| 302 |
-
"</div>",
|
| 303 |
)
|
| 304 |
)
|
| 305 |
|
|
@@ -314,6 +316,9 @@ def create_demo(model):
|
|
| 314 |
images.append([])
|
| 315 |
for j in range(model.grid_w):
|
| 316 |
images[i].append(inputs[i * max_grid_w + j])
|
|
|
|
|
|
|
|
|
|
| 317 |
seed, cfg, steps, upsampling_steps, upsampling_noise, layout_text, task_text, content_text = inputs[-8:]
|
| 318 |
|
| 319 |
results = generate(
|
|
@@ -489,7 +494,7 @@ def parse_args():
|
|
| 489 |
if __name__ == "__main__":
|
| 490 |
args = parse_args()
|
| 491 |
|
| 492 |
-
snapshot_download(repo_id="VisualCloze/VisualCloze", repo_type="model", local_dir="models")
|
| 493 |
|
| 494 |
# Initialize model
|
| 495 |
model = VisualClozeModel(resolution=args.resolution, model_path=args.model_path, precision=args.precision)
|
|
@@ -498,4 +503,5 @@ if __name__ == "__main__":
|
|
| 498 |
demo = create_demo(model)
|
| 499 |
|
| 500 |
# Start Gradio server
|
| 501 |
-
demo.launch()
|
|
|
|
|
|
| 17 |
|
| 18 |
GUIDANCE = """
|
| 19 |
|
| 20 |
+
|
| 21 |
+
## 📧 Contact:
|
| 22 |
+
Need help or have questions? Contact us at: lizhongyu [AT] mail.nankai.edu.cn.
|
| 23 |
+
|
| 24 |
## 📋 Quick Start Guide:
|
| 25 |
1. Adjust **Number of In-context Examples**, 0 disables in-context learning.
|
| 26 |
2. Set **Task Columns**, the number of images involved in a task.
|
|
|
|
| 28 |
4. Click **Generate** to create the images.
|
| 29 |
5. Parameters can be fine-tuned under **Advanced Options**.
|
| 30 |
|
| 31 |
+
## 🔥 Task Examples:
|
| 32 |
+
Click the task button in the right bottom to acquire **examples** of various tasks.
|
| 33 |
+
Make sure all images and prompts are loaded before clicking the generate button.
|
| 34 |
|
|
|
|
| 35 |
|
| 36 |
+
## 💻 Runtime on the Zero GPU:
|
| 37 |
+
The runtime on the Zero GPU depends on the size of the image grid.
|
| 38 |
+
When generating an image with the resolution of 1024,
|
| 39 |
+
the runtime is approximately **[45s for a 2x2 grid], [55s for a 2x3 grid], [70s for a 3x3 grid], [90s for a 3x4 grid]**.
|
| 40 |
+
When generating three images in a 3x4 grid, i.e., Image to Depth + Normal + Hed,
|
| 41 |
+
the runtime is approximately **110s**.
|
| 42 |
+
**Deploying locally with an 80G A100 can reduce the runtime by more than half.**
|
| 43 |
|
| 44 |
"""
|
| 45 |
|
|
|
|
| 96 |
for i in range(max_grid_h):
|
| 97 |
# Add row label before each row
|
| 98 |
row_texts.append(gr.Markdown(
|
| 99 |
+
"## Query" if i == default_grid_h - 1 else f"## In-context Example {i + 1}",
|
|
|
|
|
|
|
| 100 |
elem_id=f"row_text_{i}",
|
| 101 |
visible=i < default_grid_h
|
| 102 |
))
|
|
|
|
| 301 |
gr.update(
|
| 302 |
elem_id=f"row_text_{i}",
|
| 303 |
visible=i < actual_h,
|
| 304 |
+
value="## Query" if i == actual_h - 1 else f"## In-context Example {i + 1}",
|
|
|
|
|
|
|
| 305 |
)
|
| 306 |
)
|
| 307 |
|
|
|
|
| 316 |
images.append([])
|
| 317 |
for j in range(model.grid_w):
|
| 318 |
images[i].append(inputs[i * max_grid_w + j])
|
| 319 |
+
if i != model.grid_h - 1:
|
| 320 |
+
if inputs[i * max_grid_w + j] is None:
|
| 321 |
+
raise gr.Error('Please upload in-context examples.')
|
| 322 |
seed, cfg, steps, upsampling_steps, upsampling_noise, layout_text, task_text, content_text = inputs[-8:]
|
| 323 |
|
| 324 |
results = generate(
|
|
|
|
| 494 |
if __name__ == "__main__":
|
| 495 |
args = parse_args()
|
| 496 |
|
| 497 |
+
# snapshot_download(repo_id="VisualCloze/VisualCloze", repo_type="model", local_dir="models")
|
| 498 |
|
| 499 |
# Initialize model
|
| 500 |
model = VisualClozeModel(resolution=args.resolution, model_path=args.model_path, precision=args.precision)
|
|
|
|
| 503 |
demo = create_demo(model)
|
| 504 |
|
| 505 |
# Start Gradio server
|
| 506 |
+
demo.launch()
|
| 507 |
+
# demo.launch(share=False, server_port=10050, server_name="0.0.0.0")
|
demo_tasks/gradio_tasks_unseen.py
CHANGED
|
@@ -253,7 +253,7 @@ def process_unseen_tasks(x):
|
|
| 253 |
mask = task.get('mask', [0 for _ in range(grid_w - 1)] + [1])
|
| 254 |
layout_prompt = get_layout_instruction(grid_w, grid_h)
|
| 255 |
|
| 256 |
-
upsampling_noise =
|
| 257 |
steps = None
|
| 258 |
outputs = [mask, grid_h, grid_w, layout_prompt, task_prompt, content_prompt, upsampling_noise, steps] + rets
|
| 259 |
break
|
|
|
|
| 253 |
mask = task.get('mask', [0 for _ in range(grid_w - 1)] + [1])
|
| 254 |
layout_prompt = get_layout_instruction(grid_w, grid_h)
|
| 255 |
|
| 256 |
+
upsampling_noise = 0.7
|
| 257 |
steps = None
|
| 258 |
outputs = [mask, grid_h, grid_w, layout_prompt, task_prompt, content_prompt, upsampling_noise, steps] + rets
|
| 259 |
break
|
visualcloze.py
CHANGED
|
@@ -91,26 +91,26 @@ class VisualClozeModel:
|
|
| 91 |
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 92 |
self.dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[self.precision]
|
| 93 |
|
| 94 |
-
# Initialize model
|
| 95 |
-
print("Initializing model...")
|
| 96 |
-
self.model = load_flow_model(model_name, device=self.device, lora_rank=self.lora_rank)
|
| 97 |
|
| 98 |
-
# Initialize VAE
|
| 99 |
-
print("Initializing VAE...")
|
| 100 |
-
self.ae = AutoencoderKL.from_pretrained(f"black-forest-labs/FLUX.1-dev", subfolder="vae", torch_dtype=self.dtype).to(self.device)
|
| 101 |
-
self.ae.requires_grad_(False)
|
| 102 |
|
| 103 |
-
# Initialize text encoders
|
| 104 |
-
print("Initializing text encoders...")
|
| 105 |
-
self.t5 = load_t5(self.device, max_length=self.max_length)
|
| 106 |
-
self.clip = load_clip(self.device)
|
| 107 |
|
| 108 |
-
self.model.eval().to(self.device, dtype=self.dtype)
|
| 109 |
|
| 110 |
-
# Load model weights
|
| 111 |
-
ckpt = torch.load(model_path)
|
| 112 |
-
self.model.load_state_dict(ckpt, strict=False)
|
| 113 |
-
del ckpt
|
| 114 |
|
| 115 |
# Initialize sampler
|
| 116 |
transport = create_transport(
|
|
@@ -337,6 +337,8 @@ class VisualClozeModel:
|
|
| 337 |
processed_images.append(blank)
|
| 338 |
if i == grid_h - 1:
|
| 339 |
mask_position.append(1)
|
|
|
|
|
|
|
| 340 |
|
| 341 |
if len(mask_position) > 1 and sum(mask_position) > 1:
|
| 342 |
if target_size is None:
|
|
@@ -443,7 +445,7 @@ class VisualClozeModel:
|
|
| 443 |
if True: # images[i] is None:
|
| 444 |
cropped = output_images[-1].crop(((i - row_start) * ret_w // self.grid_w, 0, ((i - row_start) + 1) * ret_w // self.grid_w, ret_h))
|
| 445 |
ret.append(cropped)
|
| 446 |
-
if mask_position[i - row_start] and is_upsampling:
|
| 447 |
upsampled = self.upsampling(
|
| 448 |
cropped,
|
| 449 |
upsampling_size,
|
|
|
|
| 91 |
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 92 |
self.dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[self.precision]
|
| 93 |
|
| 94 |
+
# # Initialize model
|
| 95 |
+
# print("Initializing model...")
|
| 96 |
+
# self.model = load_flow_model(model_name, device=self.device, lora_rank=self.lora_rank)
|
| 97 |
|
| 98 |
+
# # Initialize VAE
|
| 99 |
+
# print("Initializing VAE...")
|
| 100 |
+
# self.ae = AutoencoderKL.from_pretrained(f"black-forest-labs/FLUX.1-dev", subfolder="vae", torch_dtype=self.dtype).to(self.device)
|
| 101 |
+
# self.ae.requires_grad_(False)
|
| 102 |
|
| 103 |
+
# # Initialize text encoders
|
| 104 |
+
# print("Initializing text encoders...")
|
| 105 |
+
# self.t5 = load_t5(self.device, max_length=self.max_length)
|
| 106 |
+
# self.clip = load_clip(self.device)
|
| 107 |
|
| 108 |
+
# self.model.eval().to(self.device, dtype=self.dtype)
|
| 109 |
|
| 110 |
+
# # Load model weights
|
| 111 |
+
# ckpt = torch.load(model_path)
|
| 112 |
+
# self.model.load_state_dict(ckpt, strict=False)
|
| 113 |
+
# del ckpt
|
| 114 |
|
| 115 |
# Initialize sampler
|
| 116 |
transport = create_transport(
|
|
|
|
| 337 |
processed_images.append(blank)
|
| 338 |
if i == grid_h - 1:
|
| 339 |
mask_position.append(1)
|
| 340 |
+
|
| 341 |
+
return processed_images
|
| 342 |
|
| 343 |
if len(mask_position) > 1 and sum(mask_position) > 1:
|
| 344 |
if target_size is None:
|
|
|
|
| 445 |
if True: # images[i] is None:
|
| 446 |
cropped = output_images[-1].crop(((i - row_start) * ret_w // self.grid_w, 0, ((i - row_start) + 1) * ret_w // self.grid_w, ret_h))
|
| 447 |
ret.append(cropped)
|
| 448 |
+
if mask_position[i - row_start] and is_upsampling and upsampling_noise < 1.0:
|
| 449 |
upsampled = self.upsampling(
|
| 450 |
cropped,
|
| 451 |
upsampling_size,
|