VisualCloze
/

VisualClozePipeline-384

@@ -70,30 +70,29 @@ Example with Depth-to-Image:
 import torch
 from diffusers import VisualClozePipeline
 from diffusers.utils import load_image
-from PIL import Image
 # Load in-context images (make sure the paths are correct and accessible)
 image_paths = [
     # in-context examples
     [
-        load_image('https://github.com/lzyhha/VisualCloze/tree/main/examples/examples/5bf755ed9dbb9b3e223e7ba35232b06e/5bf755ed9dbb9b3e223e7ba35232b06e_depth-anything-v2_Large.jpg'),
-        load_image('https://github.com/lzyhha/VisualCloze/tree/main/examples/examples/5bf755ed9dbb9b3e223e7ba35232b06e/5bf755ed9dbb9b3e223e7ba35232b06e.jpg'),
     ],
     # query with the target image
     [
-        load_image('https://github.com/lzyhha/VisualCloze/tree/main/examples/examples/2b74476568f7562a6aa832d423132ed3/2b74476568f7562a6aa832d423132ed3_depth-anything-v2_Large.jpg'),
         None,  # Target image slot left empty; this is the image to be generated
     ],
 ]
 # Task and content prompt
 task_prompt = "Each row outlines a logical process, starting from [IMAGE1] gray-based depth map with detailed object contours, to achieve [IMAGE2] an image with flawless clarity."
-content_prompt = """Group photo of five young adults enjoying a rooftop gathering at dusk. The group is positioned in the center, with three women and two men smiling and embracing.
-The woman on the far left wears a floral top and holds a drink, looking slightly to the right.
-Next to her, a woman in a denim jacket stands close to a woman in a white blouse, both smiling directly at the camera.
-The fourth woman, in an orange top, stands close to the man on the far right, who wears a red shirt and blue blazer, smiling broadly.
-The background features a cityscape with a tall building and string lights hanging overhead, creating a warm, festive atmosphere.
-Soft natural lighting, warm color palette, shallow depth of field, intimate and joyful mood, slightly blurred background, urban rooftop setting, evening ambiance."""
 # Load the VisualClozePipeline
 pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", torch_dtype=torch.bfloat16)
@@ -104,11 +103,11 @@ image_result = pipe(
     task_prompt=task_prompt,
     content_prompt=content_prompt,
     image=image_paths,
-    height=1632,
-    width=1232,
     upsampling_strength=0.4,
     guidance_scale=30,
-    num_inference_steps=50,
     max_sequence_length=512,
     generator=torch.Generator("cpu").manual_seed(0)
 ).images[0]

 import torch
 from diffusers import VisualClozePipeline
 from diffusers.utils import load_image
 # Load in-context images (make sure the paths are correct and accessible)
 image_paths = [
     # in-context examples
     [
+        load_image('https://github.com/lzyhha/VisualCloze/raw/main/examples/examples/93bc1c43af2d6c91ac2fc966bf7725a2/93bc1c43af2d6c91ac2fc966bf7725a2_depth-anything-v2_Large.jpg'),
+        load_image('https://github.com/lzyhha/VisualCloze/raw/main/examples/examples/93bc1c43af2d6c91ac2fc966bf7725a2/93bc1c43af2d6c91ac2fc966bf7725a2.jpg'),
     ],
     # query with the target image
     [
+        load_image('https://github.com/lzyhha/VisualCloze/raw/main/examples/examples/79f2ee632f1be3ad64210a641c4e201b/79f2ee632f1be3ad64210a641c4e201b_depth-anything-v2_Large.jpg'),
         None,  # Target image slot left empty; this is the image to be generated
     ],
 ]
 # Task and content prompt
 task_prompt = "Each row outlines a logical process, starting from [IMAGE1] gray-based depth map with detailed object contours, to achieve [IMAGE2] an image with flawless clarity."
+content_prompt = """A serene portrait of a young woman with long dark hair, wearing a beige dress with intricate
+gold embroidery, standing in a softly lit room. She holds a large bouquet of pale pink roses in a black box,
+positioned in the center of the frame. The background features a tall green plant to the left and a framed artwork
+on the wall to the right. A window on the left allows natural light to gently illuminate the scene.
+The woman gazes down at the bouquet with a calm expression. Soft natural lighting, warm color palette,
+high contrast, photorealistic, intimate, elegant, visually balanced, serene atmosphere."""
 # Load the VisualClozePipeline
 pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", torch_dtype=torch.bfloat16)
     task_prompt=task_prompt,
     content_prompt=content_prompt,
     image=image_paths,
+    upsampling_width=1024,
+    upsampling_height=1024,
     upsampling_strength=0.4,
     guidance_scale=30,
+    num_inference_steps=30,
     max_sequence_length=512,
     generator=torch.Generator("cpu").manual_seed(0)
 ).images[0]