"""Gradio demo app for the Action2Vision InstructPix2Pix model.

Loads the ``yutengz/Action2Vision`` pipeline from the Hugging Face Hub at
import time and exposes a simple image + instruction -> image web UI.
"""

import gradio as gr
import torch
from diffusers import (
    EulerAncestralDiscreteScheduler,
    StableDiffusionInstructPix2PixPipeline,
)
from PIL import Image

# Model repository on the Hugging Face Hub.
model_id = "yutengz/Action2Vision"

# The pipeline is loaded in float32 and placed on the CPU. The previously
# noted alternative for a GPU host is torch_dtype=torch.float16 together
# with .to("cuda").
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    safety_checker=None,
).to("cpu")

# Replace the pipeline's default scheduler with Euler-ancestral sampling,
# reusing the existing scheduler configuration.
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)


def predict(image: Image.Image, prompt: str) -> Image.Image:
    """Generate the predicted frame for *image* given an instruction.

    Args:
        image: Source image supplied by the Gradio ``Image`` input.
        prompt: Text instruction passed to the pipeline.

    Returns:
        The first image produced by the pipeline.

    Raises:
        gr.Error: If no source image was provided.
    """
    # Gradio passes None when the user submits without uploading an image;
    # surface a readable message instead of an AttributeError on .convert().
    if image is None:
        raise gr.Error("Please provide a source image.")

    # The pipeline is always fed a 256x256 RGB image.
    # NOTE(review): presumably matches the fine-tuning resolution - confirm.
    source = image.convert("RGB").resize((256, 256))
    output = pipe(image=source, prompt=prompt)
    return output.images[0]


demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Source Image"),
        gr.Textbox(label="Instruction Prompt", placeholder="e.g., stack the blocks"),
    ],
    outputs=gr.Image(label="Predicted Image"),
    title="🧠 Action2Vision",
    description="A fine-tuned InstructPix2Pix model for robotic action frame prediction.",
)

# Only start the web server when executed as a script, so importing this
# module (e.g. for tooling) does not block on launch().
if __name__ == "__main__":
    demo.launch()