| import torch |
| import spaces |
| import gradio as gr |
| from transformers import pipeline |
| from PIL import Image |
| from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor |
|
|
| |
# Single checkpoint id shared by the model weights and the matching processor.
_CHECKPOINT = "google/pix2struct-screen2words-large"

# The processor prepares image/text inputs and decodes generated token ids.
processor = Pix2StructProcessor.from_pretrained(_CHECKPOINT)

# Load the weights in bfloat16, move them to the GPU, and switch the module
# to evaluation mode once at import time so every request reuses the same model.
model = Pix2StructForConditionalGeneration.from_pretrained(
    _CHECKPOINT,
    dtype=torch.bfloat16,
)
model = model.to("cuda")
model.eval()
|
|
| |
@spaces.GPU
def describe_ui(image):
    """Generate a natural-language description of a UI screenshot.

    Args:
        image: The image to describe, as delivered by the Gradio image input
            (a PIL image). ``None`` when the user submits without uploading
            anything.

    Returns:
        The decoded text generated by the Pix2Struct model, with tokenizer
        special tokens removed.

    Raises:
        gr.Error: If no image was provided, so the UI shows a readable
            message instead of an opaque processor failure.
    """
    if image is None:
        raise gr.Error("Please upload an image first.")

    # The text is passed to the processor alongside the image; floating-point
    # tensors are cast to bfloat16 on the GPU to match how the model was loaded.
    inputs = processor(
        images=image,
        text="This is an image of ",
        return_tensors="pt",
    ).to(dtype=torch.bfloat16, device="cuda")

    predictions = model.generate(**inputs)

    # Strip special tokens so control markers from the tokenizer are not
    # shown to the end user as part of the description.
    return processor.decode(predictions[0], skip_special_tokens=True)
|
|
| |
# Web UI: a single image input wired to describe_ui, with plain-text output.
screenshot_input = gr.Image(type="pil")

demo = gr.Interface(
    fn=describe_ui,
    inputs=screenshot_input,
    outputs="text",
    title="UI Screen Describer (Pix2Struct)",
    description=(
        "Upload a screenshot or UI image and get an automatic description "
        "powered by Google’s Pix2Struct model."
    ),
)

# Start the Gradio server as soon as the script is executed.
demo.launch()
|
|