"""Gradio demo that describes UI screenshots with Google's Pix2Struct model."""

from typing import Optional

import torch
import spaces
import gradio as gr
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
from PIL import Image

# Checkpoint shared by the model and its processor.
MODEL_ID = "google/pix2struct-screen2words-large"
DEVICE = "cuda"

# Load model and processor once at import time so every request reuses them.
model = Pix2StructForConditionalGeneration.from_pretrained(
    MODEL_ID, dtype=torch.bfloat16
).to(DEVICE)
model.eval()
processor = Pix2StructProcessor.from_pretrained(MODEL_ID)


@spaces.GPU
def describe_ui(image: Optional[Image.Image], text: Optional[str]) -> str:
    """Generate a textual description of a UI screenshot.

    Args:
        image: The uploaded screenshot as a PIL image (Gradio passes ``None``
            when nothing was uploaded).
        text: Optional user prompt. ``None``, an empty string, or a
            whitespace-only string means "no prompt".

    Returns:
        The decoded model output with special tokens removed.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Fail with a readable message instead of an opaque processor error.
        raise gr.Error("Please upload a UI screenshot first.")

    prompt = (text or "").strip()
    processor_kwargs = {"images": image, "return_tensors": "pt"}
    if prompt:
        # Only condition the decoder when the user actually typed something.
        # The Pix2Struct documentation passes add_special_tokens=False for
        # text-conditioned generation so the prompt is not terminated by an
        # end-of-sequence token before generation starts.
        processor_kwargs["text"] = prompt
        processor_kwargs["add_special_tokens"] = False

    # Match the model's dtype and device for the processed inputs.
    inputs = processor(**processor_kwargs).to(dtype=torch.bfloat16, device=DEVICE)
    predictions = model.generate(**inputs)
    # Strip special tokens so they do not leak into the user-facing textbox.
    return processor.decode(predictions[0], skip_special_tokens=True)


# Build and launch the Gradio interface.
demo = gr.Interface(
    fn=describe_ui,
    inputs=[
        gr.Image(type="pil", label="Upload UI Screenshot"),
        gr.Textbox(
            label="Optional prompt / instruction",
            placeholder="e.g. Describe layout and buttons",
        ),
    ],
    outputs=gr.Textbox(label="Model Output"),
    title="UI Screen Describer (Pix2Struct)",
    description="Upload a screenshot or UI image and optionally enter a text prompt. The model (Google Pix2Struct) will generate a detailed description.",
)
demo.launch()