UI_Screen_Description_Generator_with_Pix2Struct

Sleeping

AlexHung29629 committed on Nov 3, 2025

Commit

6ef828c

verified ·

1 Parent(s): 37f620a

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,27 +1,34 @@
 import torch
 import spaces
 import gradio as gr
-from transformers import pipeline
-from PIL import Image
 from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
 # Load model and processor
-model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-screen2words-large", dtype=torch.bfloat16).to("cuda")
 model.eval()
 processor = Pix2StructProcessor.from_pretrained("google/pix2struct-screen2words-large")
 # Define the function
 @spaces.GPU
-def describe_ui(image):
-    inputs = processor(images=image, text="", return_tensors="pt").to(dtype=torch.bfloat16, device="cuda")
     predictions = model.generate(**inputs)
     return processor.decode(predictions[0], skip_special_tokens=False)
 # Launch the Gradio interface
 gr.Interface(
     fn=describe_ui,
-    inputs=gr.Image(type="pil"),
-    outputs="text",
     title="UI Screen Describer (Pix2Struct)",
-    description="Upload a screenshot or UI image and get an automatic description powered by Google’s Pix2Struct model."
 ).launch()

 import torch
 import spaces
 import gradio as gr
 from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
+from PIL import Image
 # Load model and processor
+model = Pix2StructForConditionalGeneration.from_pretrained(
+    "google/pix2struct-screen2words-large", dtype=torch.bfloat16
+).to("cuda")
 model.eval()
 processor = Pix2StructProcessor.from_pretrained("google/pix2struct-screen2words-large")
 # Define the function
 @spaces.GPU
+def describe_ui(image, text):
+    # text is the user-supplied prompt; it may be an empty string
+    inputs = processor(images=image, text=text or "", return_tensors="pt").to(
+        dtype=torch.bfloat16, device="cuda"
+    )
     predictions = model.generate(**inputs)
     return processor.decode(predictions[0], skip_special_tokens=False)
 # Launch the Gradio interface
 gr.Interface(
     fn=describe_ui,
+    inputs=[
+        gr.Image(type="pil", label="Upload UI Screenshot"),
+        gr.Textbox(label="Optional prompt / instruction", placeholder="e.g. Describe layout and buttons"),
+    ],
+    outputs=gr.Textbox(label="Model Output"),
     title="UI Screen Describer (Pix2Struct)",
+    description="Upload a screenshot or UI image and optionally enter a text prompt. The model (Google Pix2Struct) will generate a detailed description.",
 ).launch()