prithivMLmods committed on
Commit 0f4a1d2 · verified · 1 Parent(s): f8804bb

update app

Files changed (1)
app.py +205 -188
app.py CHANGED
@@ -1,13 +1,13 @@
  import os
  import re
  import json
  import time
- import shutil
- import uuid
- import tempfile
  import unicodedata
  from io import BytesIO
- from typing import Tuple, Optional, List, Iterable

  import gradio as gr
  import numpy as np
@@ -17,15 +17,21 @@ from PIL import Image, ImageDraw, ImageFont

  # Transformers & Qwen Utils
  from transformers import (
-     Qwen2_5_VLForConditionalGeneration,
      AutoProcessor,
  )
  from qwen_vl_utils import process_vision_info

- # Gradio Theme Utils
  from gradio.themes import Soft
  from gradio.themes.utils import colors, fonts, sizes

  colors.steel_blue = colors.Color(
      name="steel_blue",
      c50="#EBF3F8",
@@ -83,240 +89,255 @@ class SteelBlueTheme(Soft):
      )

  steel_blue_theme = SteelBlueTheme()
-
- css = """
- #main-title h1 { font-size: 2.3em !important; }
- #out_img { height: 600px; object-fit: contain; }
- """

  # -----------------------------------------------------------------------------
- # 2. MODEL LOADING (Global Setup)
  # -----------------------------------------------------------------------------

- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
- print(f"Using device: {device}")
-
- # System Prompt
- OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
- You need to generate the next action to complete the task.
-
- Output your action inside a <tool_call> block using JSON format.
- Include "coordinate": [x, y] in pixels for interactions.
-
- Examples:
- <tool_call>
- {"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}
- </tool_call>
-
- <tool_call>
- {"name": "User", "arguments": {"action": "type", "coordinate": [100, 200], "text": "hello"}}
- </tool_call>
- """
-
- # Load Fara-7B
- print("Loading Fara-7B...")
- MODEL_ID_V = "microsoft/Fara-7B"
- processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
- model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_V,
-     trust_remote_code=True,
-     torch_dtype=torch.bfloat16
- ).to(device).eval()
-
- # Load UI-TARS-1.5-7B
- print("Loading UI-TARS-1.5-7B...")
- # Note: Using the official SFT repo. Adjust if you have a specific private repo.
- MODEL_ID_X = "ByteDance-Seed/UI-TARS-1.5-7B"
- processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
- model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_X,
-     trust_remote_code=True,
-     torch_dtype=torch.bfloat16,
- ).to(device).eval()
-
- print("✅ All Models Loaded Successfully")

  # -----------------------------------------------------------------------------
- # 3. UTILS: IMAGE, PARSING, VISUALIZATION
  # -----------------------------------------------------------------------------

  def array_to_image(image_array: np.ndarray) -> Image.Image:
-     if image_array is None:
-         raise ValueError("No image provided. Please upload an image.")
      return Image.fromarray(np.uint8(image_array))

- def get_navigation_prompt(task, image):
      return [
          {"role": "system", "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]},
-         {"role": "user", "content": [
-             {"type": "image", "image": image},
-             {"type": "text", "text": f"Instruction: {task}"},
-         ]},
      ]

- def parse_tool_calls(response: str) -> list[dict]:
-     """
-     Parses the <tool_call>{JSON}</tool_call> format.
-     """
      actions = []
-     matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)

      for match in matches:
          try:
-             json_str = match.strip()
-             data = json.loads(json_str)
-
              args = data.get("arguments", {})
              coords = args.get("coordinate", [])
              action_type = args.get("action", "unknown")
              text_content = args.get("text", "")
-
-             if coords and isinstance(coords, list) and len(coords) == 2:
-                 actions.append({
-                     "type": action_type,
-                     "x": float(coords[0]),
-                     "y": float(coords[1]),
-                     "text": text_content,
-                     "raw_json": data
-                 })
-                 print(f"Parsed Action: {action_type} at {coords}")
-             else:
-                 # Handle actions without coordinates (like pressing enter generally)
                  actions.append({
-                     "type": action_type,
-                     "text": text_content,
-                     "raw_json": data
                  })
-
-         except json.JSONDecodeError:
-             print(f"Failed to parse JSON: {match}")
-
      return actions

  def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
-     """Draws markers on the image based on parsed pixel coordinates."""
-     if not actions:
-         return None
-
      img_copy = original_image.copy()
      draw = ImageDraw.Draw(img_copy)
      width, height = img_copy.size

-     try:
-         font = ImageFont.load_default()
-     except:
-         font = None
-
-     colors = {
-         'type': 'blue',
-         'click': 'red',
-         'left_click': 'red',
-         'right_click': 'purple',
-         'double_click': 'orange',
-         'unknown': 'green'
-     }

      for act in actions:
-         # Only draw if coordinates exist
-         if 'x' not in act or 'y' not in act:
-             continue
-
          x = act['x']
          y = act['y']

-         # Check if Normalized (0.0 - 1.0) or Absolute (Pixels > 1.0)
          if x <= 1.0 and y <= 1.0 and x > 0:
-             pixel_x = int(x * width)
-             pixel_y = int(y * height)
          else:
-             pixel_x = int(x)
-             pixel_y = int(y)

-         action_type = act['type']
-         color = colors.get(action_type, 'green')

-         # Draw Circle Target
-         r = 12
          draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=4)
          draw.ellipse([pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3], fill=color)

-         # Draw Label text
-         label_text = f"{action_type}"
-         if act['text']:
-             label_text += f": '{act['text']}'"
-
-         text_pos = (pixel_x + 15, pixel_y - 10)
-         bbox = draw.textbbox(text_pos, label_text, font=font)
-         draw.rectangle(bbox, fill="black")
-         draw.text(text_pos, label_text, fill="white", font=font)

      return img_copy

  # -----------------------------------------------------------------------------
- # 4. PROCESSING LOGIC
  # -----------------------------------------------------------------------------

- @spaces.GPU
- def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str) -> Tuple[str, Optional[Image.Image]]:
-     if input_numpy_image is None:
-         return "⚠️ Please upload an image first.", None

-     # 1. Select Model
-     if model_choice == "Fara-7B":
-         model = model_v
-         processor = processor_v
-     elif model_choice == "UI-TARS-1.5-7B":
-         model = model_x
-         processor = processor_x
-     else:
-         return "Invalid model selection", None
-
-     # 2. Prepare Data
      input_pil_image = array_to_image(input_numpy_image)
-     prompt = get_navigation_prompt(task, input_pil_image)
-
-     # 3. Generate
-     text_prompts = processor.apply_chat_template(
-         prompt, tokenize=False, add_generation_prompt=True
-     )
-     image_inputs, video_inputs = process_vision_info(prompt)
-
-     inputs = processor(
-         text=[text_prompts],
-         images=image_inputs,
-         videos=video_inputs,
-         padding=True,
-         return_tensors="pt",
-     )
-     inputs = inputs.to(device)

-     print(f"Generating with {model_choice}...")
-     with torch.no_grad():
-         generated_ids = model.generate(**inputs, max_new_tokens=512)
-
-     generated_ids_trimmed = [
-         out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-     ]
-
-     raw_response = processor.batch_decode(
-         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-     )[0]
-
-     print(f"Raw Output:\n{raw_response}")
-
-     # 4. Parse & Visualize
-     actions = parse_tool_calls(raw_response)
-
      output_image = input_pil_image
      if actions:
-         visualized = create_localized_image(input_pil_image, actions)
-         if visualized:
-             output_image = visualized

      return raw_response, output_image

  # -----------------------------------------------------------------------------
- # 5. GRADIO UI
  # -----------------------------------------------------------------------------

  with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
@@ -344,23 +365,19 @@ with gr.Blocks(theme=steel_blue_theme, css=css) as demo:

          with gr.Column(scale=3):
              output_image = gr.Image(label="Visualized Action Points", elem_id="out_img", height=500)
-             output_text = gr.Textbox(label="Raw Model Output (JSON)", lines=8, show_copy_button=True)

-     # Wire up the button
      submit_btn.click(
          fn=process_screenshot,
          inputs=[input_image, task_input, model_choice],
          outputs=[output_text, output_image]
      )

-     # Examples
      gr.Examples(
-         examples=[
-             ["./assets/google.png", "Search for 'Hugging Face'", "Fara-7B"],
-         ],
          inputs=[input_image, task_input, model_choice],
          label="Quick Examples"
      )

  if __name__ == "__main__":
-     demo.queue(max_size=20).launch(show_error=True)
 
  import os
  import re
  import json
+ import gc
  import time
  import unicodedata
+ import traceback
+ import contextlib
  from io import BytesIO
+ from typing import Tuple, Optional, List, Dict, Any

  import gradio as gr
  import numpy as np

  # Transformers & Qwen Utils
  from transformers import (
+     Qwen2_5_VLForConditionalGeneration,
      AutoProcessor,
+     AutoModelForImageTextToText
  )
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
  from qwen_vl_utils import process_vision_info

+ # Gradio Theme
  from gradio.themes import Soft
  from gradio.themes.utils import colors, fonts, sizes

+ # -----------------------------------------------------------------------------
+ # 1. THEME CONFIGURATION
+ # -----------------------------------------------------------------------------
+
  colors.steel_blue = colors.Color(
      name="steel_blue",
      c50="#EBF3F8",

      )

  steel_blue_theme = SteelBlueTheme()
+ css = "#main-title h1 { font-size: 2.3em !important; } #out_img { height: 600px; object-fit: contain; }"

  # -----------------------------------------------------------------------------
+ # 2. MODEL MANAGEMENT
  # -----------------------------------------------------------------------------

+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ current_model_state = {"model": None, "processor": None, "name": None}
+
+ def load_fara_model():
+     print("🔄 Loading Fara-7B...")
+     MODEL_ID_V = "microsoft/Fara-7B"
+     processor = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
+     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+         MODEL_ID_V, trust_remote_code=True, torch_dtype=torch.float16
+     ).to(DEVICE).eval()
+     return model, processor
+
+ def load_uitars_model():
+     print("🔄 Loading UI-TARS-1.5-7B...")
+     MODEL_ID_X = "ByteDance-Seed/UI-TARS-1.5-7B"  # Updated to official HF ID
+     try:
+         model = AutoModelForImageTextToText.from_pretrained(
+             MODEL_ID_X, torch_dtype=torch.float16, trust_remote_code=True
+         ).to(DEVICE).eval()
+         # Important: use_fast=False for UI-TARS compat
+         processor = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True, use_fast=False)
+         return model, processor
+     except Exception as e:
+         print(f"Error loading UI-TARS: {e}")
+         raise e
+
+ def get_model_pipeline(model_choice: str):
+     global current_model_state
+     if current_model_state["name"] == model_choice and current_model_state["model"] is not None:
+         return current_model_state["model"], current_model_state["processor"]
+
+     if current_model_state["model"] is not None:
+         del current_model_state["model"]
+         del current_model_state["processor"]
+         gc.collect()
+         torch.cuda.empty_cache()
+
+     if model_choice == "Fara-7B":
+         model, processor = load_fara_model()
+     else:
+         model, processor = load_uitars_model()
+
+     current_model_state["model"] = model
+     current_model_state["processor"] = processor
+     current_model_state["name"] = model_choice
+     return model, processor

  # -----------------------------------------------------------------------------
+ # 3. UTILS & PROMPTS
  # -----------------------------------------------------------------------------

  def array_to_image(image_array: np.ndarray) -> Image.Image:
+     if image_array is None: raise ValueError("No image provided.")
      return Image.fromarray(np.uint8(image_array))

+ # Fara Prompt
+ def get_fara_prompt(task, image):
+     OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
+ You need to generate the next action to complete the task.
+ Output your action inside a <tool_call> block using JSON format.
+ Include "coordinate": [x, y] in pixels for interactions.
+ Examples:
+ <tool_call>{"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}</tool_call>
+ <tool_call>{"name": "User", "arguments": {"action": "type", "coordinate": [100, 200], "text": "hello"}}</tool_call>
+ """
      return [
          {"role": "system", "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]},
+         {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": f"Instruction: {task}"}]},
      ]

+ # UI-TARS Prompt
+ def get_uitars_prompt(task, image):
+     guidelines = (
+         "Localize an element on the GUI image according to my instructions and "
+         "output a click position as Click(x, y) with x num pixels from the left edge "
+         "and y num pixels from the top edge."
+     )
+     return [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": image},
+                 {"type": "text", "text": f"{guidelines}\n{task}"}
+             ],
+         }
+     ]
+
+ def get_image_proc_params(processor) -> Dict[str, int]:
+     ip = getattr(processor, "image_processor", None)
+     return {
+         "patch_size": getattr(ip, "patch_size", 14),
+         "merge_size": getattr(ip, "merge_size", 2),  # Adjusted for typical TARS
+         "min_pixels": getattr(ip, "min_pixels", 256 * 256),
+         "max_pixels": getattr(ip, "max_pixels", 1280 * 1280),
+     }
+
+ # -----------------------------------------------------------------------------
+ # 4. PARSING LOGIC
+ # -----------------------------------------------------------------------------
+
+ def parse_uitars_response(text: str, img_w: int, img_h: int) -> List[Dict]:
+     """Parse UI-TARS specific output formats"""
      actions = []
+     # 1. Click(x,y)
+     m = re.search(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text)
+     if m:
+         x, y = int(m.group(1)), int(m.group(2))
+         actions.append({"type": "click", "x": x, "y": y, "text": ""})
+         return actions

+     # 2. start_box='(x,y)'
+     m = re.search(r"start_box=['\"]\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]", text)
+     if m:
+         x, y = int(m.group(1)), int(m.group(2))
+         actions.append({"type": "click", "x": x, "y": y, "text": ""})
+         return actions
+
+     return actions
+
+ def parse_fara_response(response: str) -> List[Dict]:
+     """Parse Fara <tool_call> JSON format"""
+     actions = []
+     matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
      for match in matches:
          try:
+             data = json.loads(match.strip())
              args = data.get("arguments", {})
              coords = args.get("coordinate", [])
              action_type = args.get("action", "unknown")
              text_content = args.get("text", "")
+             if coords and len(coords) == 2:
                  actions.append({
+                     "type": action_type, "x": float(coords[0]), "y": float(coords[1]), "text": text_content
                  })
+         except: pass
      return actions

  def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
+     if not actions: return None
      img_copy = original_image.copy()
      draw = ImageDraw.Draw(img_copy)
      width, height = img_copy.size

+     try: font = ImageFont.load_default()
+     except: font = None

      for act in actions:
          x = act['x']
          y = act['y']

+         # Normalize check
          if x <= 1.0 and y <= 1.0 and x > 0:
+             pixel_x, pixel_y = int(x * width), int(y * height)
          else:
+             pixel_x, pixel_y = int(x), int(y)

+         color = 'red' if 'click' in act['type'] else 'blue'

+         # Draw Target
+         r = 15
          draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=4)
          draw.ellipse([pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3], fill=color)

+         # Draw Label
+         label = f"{act['type']}: {act['text']}" if act['text'] else act['type']
+         text_pos = (pixel_x + 18, pixel_y - 12)
+         bbox = draw.textbbox(text_pos, label, font=font)
+         draw.rectangle((bbox[0]-2, bbox[1]-2, bbox[2]+2, bbox[3]+2), fill="black")
+         draw.text(text_pos, label, fill="white", font=font)

      return img_copy

  # -----------------------------------------------------------------------------
+ # 5. CORE LOGIC
  # -----------------------------------------------------------------------------

+ @spaces.GPU(duration=120)
+ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
+     if input_numpy_image is None: return "⚠️ Please upload an image.", None

+     # 1. Load Model
+     model, processor = get_model_pipeline(model_choice)
      input_pil_image = array_to_image(input_numpy_image)
+     orig_w, orig_h = input_pil_image.size
+
+     # 2. Preprocess & Generate
+     if model_choice == "UI-TARS-1.5-7B":
+         # Specific UI-TARS resizing logic
+         ip_params = get_image_proc_params(processor)
+         resized_h, resized_w = smart_resize(
+             input_pil_image.height, input_pil_image.width,
+             factor=ip_params["patch_size"] * ip_params["merge_size"],
+             min_pixels=ip_params["min_pixels"], max_pixels=ip_params["max_pixels"]
+         )
+         proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
+         messages = get_uitars_prompt(task, proc_image)
+
+         # UI-TARS uses apply_chat_template but often requires manual text construction internally
+         # We'll rely on the standard processor flow which handles this if trust_remote_code=True
+         text_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         inputs = processor(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
+         inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             generated_ids = model.generate(**inputs, max_new_tokens=128)
+
+         # Decode
+         generated_ids = [out_ids[len(in_seq):] for in_seq, out_ids in zip(inputs.get("input_ids"), generated_ids)]
+         raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+         # Parse (Scaling coordinates back to original size)
+         actions = parse_uitars_response(raw_response, resized_w, resized_h)
+         # Scale back coordinates
+         scale_x, scale_y = orig_w / resized_w, orig_h / resized_h
+         for a in actions:
+             a['x'] = int(a['x'] * scale_x)
+             a['y'] = int(a['y'] * scale_y)
+
+     else:  # Fara-7B
+         messages = get_fara_prompt(task, input_pil_image)
+         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         image_inputs, video_inputs = process_vision_info(messages)
+         inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+         inputs = inputs.to(DEVICE)
+
+         with torch.no_grad():
+             generated_ids = model.generate(**inputs, max_new_tokens=512)
+
+         generated_ids = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+         raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+         actions = parse_fara_response(raw_response)

+     # 3. Visualize
      output_image = input_pil_image
      if actions:
+         vis = create_localized_image(input_pil_image, actions)
+         if vis: output_image = vis

      return raw_response, output_image

  # -----------------------------------------------------------------------------
+ # 6. UI SETUP
  # -----------------------------------------------------------------------------

  with gr.Blocks(theme=steel_blue_theme, css=css) as demo:

          with gr.Column(scale=3):
              output_image = gr.Image(label="Visualized Action Points", elem_id="out_img", height=500)
+             output_text = gr.Textbox(label="Raw Model Output", lines=8, show_copy_button=True)

      submit_btn.click(
          fn=process_screenshot,
          inputs=[input_image, task_input, model_choice],
          outputs=[output_text, output_image]
      )

      gr.Examples(
+         examples=[["./assets/google.png", "Search for 'Hugging Face'", "Fara-7B"]],
          inputs=[input_image, task_input, model_choice],
          label="Quick Examples"
      )

  if __name__ == "__main__":
+     demo.queue().launch()
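
For reference, a minimal self-contained sketch of the coordinate round-trip the new UI-TARS branch relies on: the screenshot is shrunk with smart_resize before inference, the model answers in resized-image pixels as Click(x, y), and process_screenshot scales the parsed point back to the original resolution. The sizes and the Click(...) string below are made-up example values, not output from this app.

    import re

    # Example values only: original screenshot size vs. a smart_resize'd copy sent to the model.
    orig_w, orig_h = 1920, 1080
    resized_w, resized_h = 1288, 728

    raw_response = "Click(644, 364)"  # made-up model reply, in resized-image pixels

    m = re.search(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", raw_response)
    if m:
        x, y = int(m.group(1)), int(m.group(2))
        # Same scale-back step as in process_screenshot: map to original-image pixels.
        scale_x, scale_y = orig_w / resized_w, orig_h / resized_h
        print(int(x * scale_x), int(y * scale_y))  # -> 960 540

The scaled point is what create_localized_image then draws on the untouched original screenshot, so the marker lands where the click would actually go.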