import torch
import spaces
import gradio as gr
from transformers import pipeline
from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

# Single source of truth for the checkpoint, device and precision so the
# model-loading code and the per-request code cannot drift apart.
MODEL_ID = "google/pix2struct-screen2words-large"
DEVICE = "cuda"
DTYPE = torch.bfloat16

# Load model and processor once at import time; every request reuses them.
model = Pix2StructForConditionalGeneration.from_pretrained(MODEL_ID, dtype=DTYPE).to(DEVICE)
model.eval()
processor = Pix2StructProcessor.from_pretrained(MODEL_ID)


@spaces.GPU
def describe_ui(image):
    """Generate a text description of a UI screenshot.

    Args:
        image: The uploaded screenshot. The Gradio input below is declared
            with ``type="pil"``; the value is ``None`` when the user submits
            the form without providing an image.

    Returns:
        The decoded model output for the first generated sequence, with
        tokenizer special tokens removed.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Report a readable message in the UI instead of letting the
        # processor fail on a missing image.
        raise gr.Error("Please upload a screenshot or UI image first.")

    # No `text` prompt is passed: the documented captioning call supplies the
    # image only. NOTE(review): a non-None text (even "") appears to be
    # tokenized into decoder inputs by the processor -- confirm against the
    # Pix2StructProcessor documentation.
    inputs = processor(images=image, return_tensors="pt").to(dtype=DTYPE, device=DEVICE)
    predictions = model.generate(**inputs)

    # skip_special_tokens=True keeps tokenizer control tokens out of the text
    # shown to the user.
    return processor.decode(predictions[0], skip_special_tokens=True)


# Launch the Gradio interface
demo = gr.Interface(
    fn=describe_ui,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="UI Screen Describer (Pix2Struct)",
    description="Upload a screenshot or UI image and get an automatic description powered by Google’s Pix2Struct model.",
)
demo.launch()