import gradio as gr
from PIL import Image
import torch
from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler

# Hugging Face Hub repository that holds the fine-tuned pipeline weights.
model_id = "yutengz/Action2Vision"

# Choose the device at startup instead of hand-editing the source: use the GPU
# with half precision when CUDA is available, otherwise fall back to the CPU
# with full precision (float16 is poorly supported for CPU inference).
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    safety_checker=None,  # load the pipeline without a safety checker
).to(device)

# Swap in the Euler Ancestral scheduler, reusing the loaded scheduler's config.
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

def predict(image: Image.Image, prompt: str) -> Image.Image:
    """Run the instruction-guided pipeline on a source image.

    The image is converted to RGB and resized to 256x256 (aspect ratio is
    not preserved) before being passed to the module-level ``pipe``.

    Args:
        image: Source image supplied by the Gradio image input.
        prompt: Text instruction forwarded to the pipeline.

    Returns:
        The first image produced by the pipeline.

    Raises:
        gr.Error: If no source image was provided.
    """
    # Gradio passes None when the image input is left empty; surface a clear
    # message in the UI instead of an AttributeError on ``None.convert``.
    if image is None:
        raise gr.Error("Please upload a source image.")

    source = image.convert("RGB").resize((256, 256))
    return pipe(image=source, prompt=prompt).images[0]

# Input widgets, listed in the order `predict` receives their values.
source_image_input = gr.Image(type="pil", label="Source Image")
instruction_input = gr.Textbox(
    label="Instruction Prompt",
    placeholder="e.g., stack the blocks",
)

# Single image widget that displays whatever `predict` returns.
predicted_image_output = gr.Image(label="Predicted Image")

# Web UI that routes the two inputs through `predict` to the output image.
demo = gr.Interface(
    fn=predict,
    inputs=[source_image_input, instruction_input],
    outputs=predicted_image_output,
    title="🧠 Action2Vision",
    description="A fine-tuned InstructPix2Pix model for robotic action frame prediction.",
)

# Start the Gradio server when this file is executed.
demo.launch()