yutengz
/

Action2Vision

StableDiffusionInstructPix2PixPipeline

Model card Files Files and versions

Metrics Training metrics Community

yutengz committed on May 5, 2025

Commit

02c8b0b

·

verified ·

1 Parent(s): 4d14930

Update README.md

Files changed (1) hide show

README.md +12 -6

README.md CHANGED Viewed

@@ -3,7 +3,7 @@ license: mit
 tags:
 - image-to-image
 ---
-# ip2p-RoboPredict: InstructPix2Pix Fine-tuning for Robotic Action Frame Prediction
 GitHub: https://github.com/yutengzhang03/Action2Vision
 <img src='img/show-example.png'/>
@@ -22,18 +22,24 @@ import PIL
 import requests
 import torch
 from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
 model_id = "yutengz/Action2Vision"
 pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16, safety_checker=None)
 pipe.to("cuda")
 pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
-url = "https://github.com/yutengzhang03/Action2Vision/blob/main/img/source.png"
 def download_image(url):
-    image = PIL.Image.open(requests.get(url, stream=True).raw)
-    image = PIL.ImageOps.exif_transpose(image)
-    image = image.convert("RGB")
     return image
 image = download_image(url)
 prompt = "There is a hammer and a block in the middle of the table. If the block is closer to the left robotic arm, it uses the left arm to pick up the hammer and strike the block; otherwise, it does the opposite."
-images = pipe(prompt, image=image, num_inference_steps=10, image_guidance_scale=1).images
 images[0]
 ```

 tags:
 - image-to-image
 ---
+# Action2Vision: InstructPix2Pix Fine-tuning for Robotic Action Frame Prediction
 GitHub: https://github.com/yutengzhang03/Action2Vision
 <img src='img/show-example.png'/>
 import requests
 import torch
 from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
 model_id = "yutengz/Action2Vision"
 pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16, safety_checker=None)
 pipe.to("cuda")
 pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
+to_tensor = transforms.ToTensor()
+resize = transforms.Resize((256, 256))
 def download_image(url):
+    image = PIL.Image.open(requests.get(url, stream=True).raw).convert("RGB").resize((256, 256))
     return image
+url = "https://github.com/yutengzhang03/Action2Vision/blob/main/img/source.png"
 image = download_image(url)
 prompt = "There is a hammer and a block in the middle of the table. If the block is closer to the left robotic arm, it uses the left arm to pick up the hammer and strike the block; otherwise, it does the opposite."
+images = pipe(prompt, image=image).images
 images[0]
 ```