prithivMLmods committed
Commit d87b209 · verified · 1 Parent(s): e207530

Update app.py

Files changed (1)
  1. app.py +168 -82
app.py CHANGED
@@ -5,7 +5,6 @@ import time
  import unicodedata
  import gc
  from io import BytesIO
- from typing import Iterable
  from typing import Tuple, Optional, List, Dict, Any
  
  import gradio as gr
@@ -114,9 +113,8 @@ except Exception as e:
  
  # --- Load UI-TARS-1.5-7B ---
  print("🔄 Loading UI-TARS-1.5-7B...")
- MODEL_ID_X = "ByteDance-Seed/UI-TARS-1.5-7B"
+ MODEL_ID_X = "bytedance/UI-TARS-7B-SFT"
  try:
-     # Important: use_fast=False is often required for custom tokenizers
      processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True, use_fast=False)
      model_x = AutoModelForImageTextToText.from_pretrained(
          MODEL_ID_X,
@@ -128,7 +126,22 @@ except Exception as e:
      model_x = None
      processor_x = None
  
- print("✅ Models loading sequence complete.")
+ # --- Load Holo2-8B ---
+ print("🔄 Loading Holo2-8B...")
+ MODEL_ID_H = "Hcompany/Holo2-8B"
+ try:
+     processor_h = AutoProcessor.from_pretrained(MODEL_ID_H, trust_remote_code=True)
+     model_h = AutoModelForImageTextToText.from_pretrained(
+         MODEL_ID_H,
+         trust_remote_code=True,
+         torch_dtype=torch.float16
+     ).to(device).eval()
+ except Exception as e:
+     print(f"Failed to load Holo2: {e}")
+     model_h = None
+     processor_h = None
+ 
+ print("✅ Model loading sequence complete.")
  
  # -----------------------------------------------------------------------------
  # 3. UTILS & PROMPTS
@@ -155,7 +168,6 @@ def get_fara_prompt(task, image):
  
  # --- UI-TARS Prompt ---
  def get_uitars_prompt(task, image):
-     # UI-TARS generally responds better to a simpler instruction when finetuned
      guidelines = (
          "Localize an element on the GUI image according to my instructions and "
          "output a click position as Click(x, y) with x num pixels from the left edge "
@@ -171,6 +183,19 @@ def get_uitars_prompt(task, image):
          }
      ]
  
+ # --- Holo2 Prompt ---
+ def get_holo2_prompt(task, image):
+     # Holo2 typically uses standard chat formatting
+     return [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": image},
+                 {"type": "text", "text": task}
+             ]
+         }
+     ]
+ 
  def get_image_proc_params(processor) -> Dict[str, int]:
      ip = getattr(processor, "image_processor", None)
      return {
@@ -185,39 +210,20 @@ def get_image_proc_params(processor) -> Dict[str, int]:
  # -----------------------------------------------------------------------------
  
  def parse_uitars_response(text: str) -> List[Dict]:
-     """Parse various UI-TARS output formats"""
+     """Parse UI-TARS specific output formats"""
      actions = []
      text = text.strip()
  
-     # Debug print
-     print(f"Parsing UI-TARS output: {text}")
- 
-     # Regex 1: Click(x, y) - Standard prompt output
-     # Matches: Click(123, 456) or Click(123,456)
-     matches_click = re.findall(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
-     for m in matches_click:
-         actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
- 
-     # Regex 2: point=[x, y] - Common model internal format
-     matches_point = re.findall(r"point=\[\s*(\d+)\s*,\s*(\d+)\s*\]", text, re.IGNORECASE)
-     for m in matches_point:
-         actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
- 
-     # Regex 3: start_box='(x, y)' - Another variant
-     matches_box = re.findall(r"start_box=['\"]?\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]?", text, re.IGNORECASE)
-     for m in matches_box:
-         actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
- 
-     # Remove duplicates if any logic matched multiple times
-     unique_actions = []
-     seen = set()
-     for a in actions:
-         key = (a['type'], a['x'], a['y'])
-         if key not in seen:
-             seen.add(key)
-             unique_actions.append(a)
- 
-     return unique_actions
+     m = re.search(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
+     if m: actions.append({"type": "click", "x": int(m[1]), "y": int(m[2]), "text": ""})
+ 
+     m = re.findall(r"point=\[\s*(\d+)\s*,\s*(\d+)\s*\]", text, re.IGNORECASE)
+     for p in m: actions.append({"type": "click", "x": int(p[0]), "y": int(p[1]), "text": ""})
+ 
+     m = re.search(r"start_box=['\"]?\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]?", text, re.IGNORECASE)
+     if m: actions.append({"type": "click", "x": int(m[1]), "y": int(m[2]), "text": ""})
+ 
+     return actions
  
  def parse_fara_response(response: str) -> List[Dict]:
      """Parse Fara <tool_call> JSON format"""
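For reference, a minimal standalone sketch (not part of the commit) of what the Click(x, y) regex in the hunk above extracts from a typical UI-TARS reply; the sample string and values are invented for illustration:

import re

sample = "Thought: the search field is near the top.\nAction: Click(412, 83)"
match = re.search(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", sample, re.IGNORECASE)
if match:
    # Groups 1 and 2 hold the x/y pixel values in the image space the model saw
    action = {"type": "click", "x": int(match[1]), "y": int(match[2]), "text": ""}
    print(action)  # {'type': 'click', 'x': 412, 'y': 83, 'text': ''}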
@@ -237,6 +243,74 @@ def parse_fara_response(response: str) -> List[Dict]:
          except: pass
      return actions
  
+ def parse_holo2_response(generated_ids, processor, input_len) -> Tuple[str, str, List[Dict]]:
+     """Parse Holo2 reasoning tokens and JSON content"""
+     all_ids = generated_ids[0].tolist()
+ 
+     # Token IDs for <|thought_start|> and <|thought_end|> (Qwen/Holo specific)
+     THOUGHT_START = 151667
+     THOUGHT_END = 151668
+ 
+     thinking_content = ""
+     content = ""
+ 
+     try:
+         if THOUGHT_START in all_ids:
+             start_idx = all_ids.index(THOUGHT_START)
+             try:
+                 end_idx = all_ids.index(THOUGHT_END)
+             except ValueError:
+                 end_idx = len(all_ids)
+ 
+             thinking_ids = all_ids[start_idx+1:end_idx]
+             thinking_content = processor.decode(thinking_ids, skip_special_tokens=True).strip()
+ 
+             # Content is everything after thought_end
+             content_ids = all_ids[end_idx+1:]
+             content = processor.decode(content_ids, skip_special_tokens=True).strip()
+         else:
+             # Fallback if no reasoning tokens found (just raw output)
+             # Slice off input tokens first
+             output_ids = all_ids[input_len:]
+             content = processor.decode(output_ids, skip_special_tokens=True).strip()
+     except Exception as e:
+         print(f"Holo Parsing Error: {e}")
+         content = processor.decode(all_ids[input_len:], skip_special_tokens=True).strip()
+ 
+     # Parse JSON Content
+     actions = []
+     try:
+         # Holo2 outputs strictly valid JSON usually
+         # E.g. {"x": 500, "y": 300, "description": "search bar"}
+         # Or {"action": "click", "point": [100, 200]}
+         # Flattening to common format
+         if "{" in content and "}" in content:
+             # Find JSON block if surrounded by text
+             json_str = re.search(r"(\{.*\})", content, re.DOTALL).group(1)
+             data = json.loads(json_str)
+ 
+             x, y = 0, 0
+             if "x" in data and "y" in data:
+                 x, y = data["x"], data["y"]
+             elif "point" in data:
+                 x, y = data["point"][0], data["point"][1]
+             elif "coordinate" in data:
+                 x, y = data["coordinate"][0], data["coordinate"][1]
+ 
+             if x or y:
+                 # Holo2 output is 0-1000 scale
+                 actions.append({
+                     "type": "click",
+                     "x": float(x),
+                     "y": float(y),
+                     "text": data.get("description", "") or data.get("text", ""),
+                     "scale_base": 1000  # Flag to indicate this needs normalization from 1000
+                 })
+     except Exception as e:
+         print(f"Holo JSON Parse Failed: {e}")
+ 
+     return content, thinking_content, actions
+ 
  def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
      if not actions: return None
      img_copy = original_image.copy()
@@ -250,32 +324,35 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
          x = act['x']
          y = act['y']
  
-         # Determine if we need to scale normalized coords (0-1) or use absolute
-         # UI-TARS usually outputs absolute pixels relative to the image size it saw.
-         # But we already scaled them in the main loop.
-         # Double check sanity:
-         if x < 1.0 and y < 1.0:
-             pixel_x, pixel_y = int(x * width), int(y * height)
+         # Holo2 Special Case (0-1000 scaling)
+         if act.get('scale_base') == 1000:
+             pixel_x = int((x / 1000) * width)
+             pixel_y = int((y / 1000) * height)
+         # Normalized (0-1)
+         elif x <= 1.0 and y <= 1.0 and x > 0:
+             pixel_x = int(x * width)
+             pixel_y = int(y * height)
+         # Absolute Pixels
          else:
-             pixel_x, pixel_y = int(x), int(y)
+             pixel_x = int(x)
+             pixel_y = int(y)
  
          color = 'red' if 'click' in act['type'].lower() else 'blue'
  
-         # Draw Target Crosshair/Circle
+         # Draw Visuals
          r = 15
-         line_width = 4
- 
-         # Circle
-         draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=line_width)
-         # Center dot
+         draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=4)
          draw.ellipse([pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3], fill=color)
  
+         # Draw Cross
+         draw.line([pixel_x - 10, pixel_y, pixel_x + 10, pixel_y], fill=color, width=2)
+         draw.line([pixel_x, pixel_y - 10, pixel_x, pixel_y + 10], fill=color, width=2)
+ 
          # Label
          label = f"{act['type']}"
          if act['text']: label += f": {act['text']}"
  
          text_pos = (pixel_x + 20, pixel_y - 10)
-         # Draw text background
          bbox = draw.textbbox(text_pos, label, font=font)
          draw.rectangle((bbox[0]-4, bbox[1]-2, bbox[2]+4, bbox[3]+2), fill="black")
          draw.text(text_pos, label, fill="white", font=font)
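To illustrate the three coordinate conventions handled in the hunk above (Holo2's 0-1000 grid, normalized 0-1 values, absolute pixels), here is a small self-contained sketch; the helper name and sample numbers are invented for illustration only:

def to_pixels(x, y, width, height, scale_base=None):
    # Holo2-style points live on a 0-1000 grid regardless of image size
    if scale_base == 1000:
        return int(x / 1000 * width), int(y / 1000 * height)
    # Normalized coordinates in (0, 1]
    if 0 < x <= 1.0 and 0 < y <= 1.0:
        return int(x * width), int(y * height)
    # Otherwise treat the values as absolute pixels
    return int(x), int(y)

print(to_pixels(500, 300, 1920, 1080, scale_base=1000))  # (960, 324)
print(to_pixels(0.25, 0.5, 1920, 1080))                  # (480, 540)
print(to_pixels(412, 83, 1920, 1080))                    # (412, 83)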
@@ -288,18 +365,19 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
  
  @spaces.GPU(duration=120)
  def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
-     if input_numpy_image is None: return "⚠️ Please upload an image.", None
+     if input_numpy_image is None: return "⚠️ Please upload an image.", None, None
  
      input_pil_image = array_to_image(input_numpy_image)
      orig_w, orig_h = input_pil_image.size
+     actions = []
+     raw_response = ""
+     reasoning_text = None
  
      # --- UI-TARS Logic ---
      if model_choice == "UI-TARS-1.5-7B":
-         if model_x is None: return "Error: UI-TARS model failed to load on startup.", None
-         print("Using UI-TARS Pipeline...")
+         if model_x is None: return "Error: UI-TARS model failed to load.", None, None
+         print("Running UI-TARS...")
  
-         # 1. Smart Resize (Crucial for UI-TARS accuracy)
-         # We must resize the image to the resolution the model expects/handles best
          ip_params = get_image_proc_params(processor_x)
          resized_h, resized_w = smart_resize(
              input_pil_image.height, input_pil_image.width,
@@ -308,50 +386,56 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
          )
          proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
  
-         # 2. Prompting
          messages = get_uitars_prompt(task, proc_image)
          text_prompt = processor_x.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
- 
-         # 3. Inputs
          inputs = processor_x(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
          inputs = {k: v.to(device) for k, v in inputs.items()}
  
-         # 4. Generate
          with torch.no_grad():
              generated_ids = model_x.generate(**inputs, max_new_tokens=128)
  
-         # Decode
          generated_ids = [out_ids[len(in_seq):] for in_seq, out_ids in zip(inputs.get("input_ids"), generated_ids)]
          raw_response = processor_x.batch_decode(generated_ids, skip_special_tokens=True)[0]
  
-         # 5. Parse
         actions = parse_uitars_response(raw_response)
  
-         # 6. Rescale Coordinates back to Original Image Size
-         # The model saw 'resized_w' x 'resized_h', so coordinates are in that space.
-         # We need to map them back to 'orig_w' x 'orig_h' for the visualizer.
+         # Rescale
          scale_x = orig_w / resized_w
          scale_y = orig_h / resized_h
- 
          for a in actions:
              a['x'] = int(a['x'] * scale_x)
              a['y'] = int(a['y'] * scale_y)
  
+     # --- Holo2 Logic ---
+     elif model_choice == "Holo2-8B":
+         if model_h is None: return "Error: Holo2 model failed to load.", None, None
+         print("Running Holo2...")
+ 
+         messages = get_holo2_prompt(task, input_pil_image)
+         text_prompt = processor_h.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         image_inputs, video_inputs = process_vision_info(messages)
+         inputs = processor_h(text=[text_prompt], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+         inputs = inputs.to(device)
+ 
+         with torch.no_grad():
+             generated_ids = model_h.generate(**inputs, max_new_tokens=512)
+ 
+         # Parse Reasoning + Content
+         input_len = len(inputs.input_ids[0])
+         content, thinking, parsed_actions = parse_holo2_response(generated_ids, processor_h, input_len)
+ 
+         raw_response = content
+         reasoning_text = thinking
+         actions = parsed_actions
+ 
      # --- Fara Logic ---
      else:
-         if model_v is None: return "Error: Fara model failed to load on startup.", None
-         print("Using Fara Pipeline...")
+         if model_v is None: return "Error: Fara model failed to load.", None, None
+         print("Running Fara...")
          messages = get_fara_prompt(task, input_pil_image)
          text_prompt = processor_v.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
          image_inputs, video_inputs = process_vision_info(messages)
- 
-         inputs = processor_v(
-             text=[text_prompt],
-             images=image_inputs,
-             videos=video_inputs,
-             padding=True,
-             return_tensors="pt"
-         )
+         inputs = processor_v(text=[text_prompt], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
          inputs = inputs.to(device)
  
          with torch.no_grad():
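As a concrete illustration of the rescaling step in the hunk above, a hypothetical example (all numbers invented) of mapping a click parsed in the smart-resized image back to the original screenshot resolution:

orig_w, orig_h = 2560, 1440        # uploaded screenshot size
resized_w, resized_h = 1288, 728   # example output of smart_resize

action = {"type": "click", "x": 644, "y": 364}  # parsed from the model's Click(...) text

scale_x = orig_w / resized_w
scale_y = orig_h / resized_h
action["x"] = int(action["x"] * scale_x)
action["y"] = int(action["y"] * scale_y)
print(action)  # {'type': 'click', 'x': 1280, 'y': 720}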
@@ -359,20 +443,22 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
  
          generated_ids = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
          raw_response = processor_v.batch_decode(generated_ids, skip_special_tokens=True)[0]
- 
-         # Fara usually outputs exact pixels based on original image
          actions = parse_fara_response(raw_response)
  
-     print(f"Raw Output: {raw_response}")
-     print(f"Parsed Actions: {actions}")
+     print(f"Raw: {raw_response}")
+     if reasoning_text: print(f"Thinking: {reasoning_text}")
  
-     # 3. Visualize
+     # Visualize
      output_image = input_pil_image
      if actions:
          vis = create_localized_image(input_pil_image, actions)
          if vis: output_image = vis
  
-     return raw_response, output_image
+     final_text_output = f"▶️ OUTPUT:\n{raw_response}"
+     if reasoning_text:
+         final_text_output = f"🧠 THINKING PROCESS:\n{reasoning_text}\n\n" + final_text_output
+ 
+     return final_text_output, output_image
  
  # -----------------------------------------------------------------------------
  # 6. UI SETUP
@@ -388,7 +474,7 @@ with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
  
      with gr.Row():
          model_choice = gr.Radio(
-             choices=["Fara-7B", "UI-TARS-1.5-7B"],
+             choices=["Fara-7B", "UI-TARS-1.5-7B", "Holo2-8B"],
              label="Select Model",
              value="Fara-7B",
              interactive=True
@@ -403,7 +489,7 @@ with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
  
      with gr.Column(scale=3):
          output_image = gr.Image(label="Visualized Action Points", elem_id="out_img", height=500)
-         output_text = gr.Textbox(label="Raw Model Output", lines=8, show_copy_button=True)
+         output_text = gr.Textbox(label="Model Output & Reasoning", lines=12, show_copy_button=True)
  
      submit_btn.click(
          fn=process_screenshot,