UI_Screen_Description_Generator_with_Pix2Struct

Sleeping

UI_Screen_Description_Generator_with_Pix2Struct

File size: 1,293 Bytes

04e2e3b
a9cdb88
981337a
b9f48bf
6ef828c
981337a
 
6ef828c
 
 
1dee560
89612fd
b9f48bf
981337a
a9cdb88
6ef828c
 
 
 
 
b9f48bf
4e80d42
981337a
 
 
 
6ef828c
 
 
 
 
981337a
6ef828c
981337a

import torch
import spaces
import gradio as gr
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
from PIL import Image

# One checkpoint id is shared by the weights and the processor so the two
# can never drift apart.
_CHECKPOINT = "google/pix2struct-screen2words-large"

# Processor: image preprocessing plus the tokenizer used for prompts and decoding.
processor = Pix2StructProcessor.from_pretrained(_CHECKPOINT)

# Model: loaded in bfloat16, moved to the CUDA device, and switched to eval mode
# because this app only runs inference.
model = Pix2StructForConditionalGeneration.from_pretrained(
    _CHECKPOINT,
    dtype=torch.bfloat16,
)
model = model.to("cuda")
model.eval()

# Inference entry point called by the Gradio interface
@spaces.GPU
def describe_ui(image, text):
    """Generate a text description of a UI screenshot with Pix2Struct.

    Args:
        image: Screenshot supplied by the Gradio image input (a PIL image,
            or ``None`` when nothing was uploaded).
        text: Optional user prompt. ``None``, an empty string, or a
            whitespace-only string is treated as "no prompt".

    Returns:
        The decoded model output as a plain string with special tokens
        (padding / end-of-sequence markers) removed.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Show a readable message in the UI instead of a processor traceback.
        raise gr.Error("Please upload a UI screenshot first.")

    prompt = (text or "").strip()
    if prompt:
        # The prompt becomes the decoder prefix. add_special_tokens=False
        # keeps the tokenizer from appending an end-of-sequence token to it,
        # following the conditional-generation example in the Pix2Struct docs.
        encoded = processor(
            images=image,
            text=prompt,
            return_tensors="pt",
            add_special_tokens=False,
        )
    else:
        # No prompt: plain image captioning, so no text is passed at all.
        # NOTE(review): assumes this checkpoint's processor is not in VQA
        # mode (where text is mandatory) - confirm against its config.
        encoded = processor(images=image, return_tensors="pt")

    # Match the model's dtype and device (loaded as bfloat16 on CUDA).
    inputs = encoded.to(dtype=torch.bfloat16, device="cuda")

    # Bound the number of *generated* tokens explicitly so that a long prompt
    # does not eat into the default total-length budget.
    predictions = model.generate(**inputs, max_new_tokens=50)

    # skip_special_tokens=True keeps pad/EOS markers out of the UI textbox.
    return processor.decode(predictions[0], skip_special_tokens=True)

# Build the Gradio UI: a screenshot plus an optional prompt go in, and the
# text generated by describe_ui comes out.
screenshot_input = gr.Image(type="pil", label="Upload UI Screenshot")
prompt_input = gr.Textbox(
    label="Optional prompt / instruction",
    placeholder="e.g. Describe layout and buttons",
)
description_output = gr.Textbox(label="Model Output")

demo = gr.Interface(
    fn=describe_ui,
    inputs=[screenshot_input, prompt_input],
    outputs=description_output,
    title="UI Screen Describer (Pix2Struct)",
    description="Upload a screenshot or UI image and optionally enter a text prompt. The model (Google Pix2Struct) will generate a detailed description.",
)

# Start the web server for the interface.
demo.launch()