UI_Screen_Description_Generator_with_Pix2Struct

Sleeping

UI_Screen_Description_Generator_with_Pix2Struct

File size: 1,023 Bytes

04e2e3b
a9cdb88
981337a
2012122
981337a
b9f48bf
981337a
 
04e2e3b
1dee560
89612fd
b9f48bf
981337a
a9cdb88
981337a
37f620a
b9f48bf
4e80d42
981337a

import torch
import spaces
import gradio as gr
from transformers import pipeline
from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

# Checkpoint identifier shared by the model weights and the processor.
MODEL_ID = "google/pix2struct-screen2words-large"

# Load the weights in bfloat16, move them to the GPU, and switch to eval mode.
model = Pix2StructForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16,
)
model = model.to("cuda")
model.eval()

# Processor used to build model inputs and to decode generated token ids.
processor = Pix2StructProcessor.from_pretrained(MODEL_ID)

# Define the function
@spaces.GPU
def describe_ui(image):
    """Generate a text description of a UI screenshot.

    Args:
        image: The screenshot to describe, as delivered by the Gradio image
            input (configured with ``type="pil"``). ``None`` when the form
            is submitted without an image.

    Returns:
        The decoded text of the first generated sequence, with tokenizer
        special tokens stripped.

    Raises:
        gr.Error: If no image was provided, so the UI shows a readable
            message instead of an internal processor/model failure.
    """
    if image is None:
        raise gr.Error("Please upload an image before requesting a description.")

    # NOTE(review): the empty text prompt is forwarded exactly as before;
    # confirm whether this checkpoint needs it at all.
    inputs = processor(images=image, text="", return_tensors="pt").to(
        dtype=torch.bfloat16, device="cuda"
    )
    predictions = model.generate(**inputs)

    # Strip special tokens so only the human-readable description is
    # returned to the user-facing text box.
    return processor.decode(predictions[0], skip_special_tokens=True)

# Build the web UI: one image input wired to describe_ui, one text output.
demo = gr.Interface(
    fn=describe_ui,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="UI Screen Describer (Pix2Struct)",
    description="Upload a screenshot or UI image and get an automatic description powered by Google’s Pix2Struct model.",
)

# Start serving the interface.
demo.launch()