prithivMLmods commited on
Commit
f31e251
·
verified ·
1 Parent(s): a60849a

update app

Browse files
Files changed (1) hide show
  1. app.py +339 -545
app.py CHANGED
@@ -3,6 +3,8 @@ import os
3
  import shutil
4
  import time
5
  import uuid
 
 
6
  import unicodedata
7
  from io import BytesIO
8
  from threading import Timer
@@ -13,11 +15,17 @@ import gradio as gr
13
  import torch
14
  import spaces
15
  from dotenv import load_dotenv
16
- from e2b_desktop import Sandbox
17
- from gradio_modal import Modal
18
- from huggingface_hub import login, upload_folder
19
  from PIL import Image, ImageDraw
20
 
 
 
 
 
 
 
 
 
 
21
  # Smolagents imports
22
  from smolagents import CodeAgent, tool, AgentImage
23
  from smolagents.memory import ActionStep, TaskStep
@@ -38,14 +46,12 @@ load_dotenv(override=True)
38
  # CONFIGURATION & CONSTANTS
39
  # -----------------------------------------------------------------------------
40
 
41
- E2B_API_KEY = os.getenv("E2B_API")
42
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
43
  if HF_TOKEN:
 
44
  login(token=HF_TOKEN)
45
 
46
- SANDBOXES = {}
47
- SANDBOX_METADATA = {}
48
- SANDBOX_TIMEOUT = 600
49
  WIDTH = 1024
50
  HEIGHT = 768
51
  TMP_DIR = "./tmp/"
@@ -58,7 +64,6 @@ if not os.path.exists(TMP_DIR):
58
 
59
  print("Loading Fara Model... This may take a moment.")
60
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
61
- # Using the Microsoft Fara model as requested
62
  MODEL_ID_F = "microsoft/Fara-7B"
63
 
64
  # Global model variables
@@ -76,7 +81,7 @@ try:
76
  print(f"Fara Model loaded successfully on {DEVICE}")
77
  except Exception as e:
78
  print(f"Error loading Fara Model: {e}")
79
- print("Falling back to Qwen/Qwen2.5-VL-7B-Instruct for demonstration if Fara is unavailable...")
80
  try:
81
  MODEL_ID_F = "Qwen/Qwen2.5-VL-7B-Instruct"
82
  processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
@@ -98,15 +103,13 @@ except Exception as e:
98
  @spaces.GPU(duration=120)
99
  def run_model_inference(formatted_messages, max_tokens=1024, stop_sequences=None):
100
  """
101
- This function runs on the GPU worker.
102
- It receives simple python objects (lists/dicts), not the complex Agent object.
103
  """
104
  global model_f, processor_f
105
 
106
  if model_f is None:
107
  raise ValueError("Model is not loaded.")
108
 
109
- # Process Inputs (Tokenization happens here to ensure tensors are on correct device)
110
  text = processor_f.apply_chat_template(
111
  formatted_messages, tokenize=False, add_generation_prompt=True
112
  )
@@ -121,10 +124,8 @@ def run_model_inference(formatted_messages, max_tokens=1024, stop_sequences=None
121
  return_tensors="pt",
122
  )
123
 
124
- # Move inputs to the model's device (GPU)
125
  inputs = inputs.to(model_f.device)
126
 
127
- # Generate
128
  with torch.no_grad():
129
  generated_ids = model_f.generate(
130
  **inputs,
@@ -133,7 +134,6 @@ def run_model_inference(formatted_messages, max_tokens=1024, stop_sequences=None
133
  tokenizer=processor_f.tokenizer,
134
  )
135
 
136
- # Decode
137
  generated_ids_trimmed = [
138
  out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
139
  ]
@@ -145,9 +145,6 @@ def run_model_inference(formatted_messages, max_tokens=1024, stop_sequences=None
145
 
146
 
147
  class FaraLocalModel(Model):
148
- """
149
- Wrapper for the local Fara (Qwen2.5-VL) model to work with SmolAgents.
150
- """
151
  def __init__(self, **kwargs):
152
  super().__init__(**kwargs)
153
 
@@ -159,13 +156,9 @@ class FaraLocalModel(Model):
159
  ) -> ChatMessage:
160
 
161
  formatted_messages = []
162
-
163
- # Convert SmolAgents messages to Qwen/Transformers format
164
- # We perform this conversion here (CPU side) to create simple dicts/lists
165
  for msg in messages:
166
  role = msg["role"]
167
  content = msg["content"]
168
-
169
  new_content = []
170
 
171
  if isinstance(content, str):
@@ -177,7 +170,6 @@ class FaraLocalModel(Model):
177
  elif isinstance(item, dict):
178
  if "type" in item:
179
  if item["type"] == "image":
180
- # Handle path or url - extract value to ensure serializability
181
  val = item.get("image") or item.get("url") or item.get("path")
182
  new_content.append({"type": "image", "image": val})
183
  else:
@@ -185,9 +177,6 @@ class FaraLocalModel(Model):
185
 
186
  formatted_messages.append({"role": role, "content": new_content})
187
 
188
- # Call the decorated global function
189
- # This crosses the boundary to the GPU worker safely because
190
- # formatted_messages contains only standard Python types (str, list, dict, PIL.Image)
191
  output_text = run_model_inference(
192
  formatted_messages=formatted_messages,
193
  max_tokens=kwargs.get("max_tokens", 1024),
@@ -200,14 +189,138 @@ class FaraLocalModel(Model):
200
  )
201
 
202
  # -----------------------------------------------------------------------------
203
- # E2B AGENT & TOOLS
204
  # -----------------------------------------------------------------------------
205
 
206
- E2B_SYSTEM_PROMPT_TEMPLATE = """You are a desktop automation assistant that can control a remote desktop environment. The current date is <<current_date>>.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
  <action process>
209
  You will be given a task to solve in several steps. At each step you will perform an action.
210
- After each action, you'll receive an updated screenshot.
211
  Then you will proceed as follows, with these sections: don't skip any!
212
 
213
  Short term goal: ...
@@ -222,7 +335,7 @@ Always format your action ('Action:' part) as Python code blocks as shown above.
222
  </action_process>
223
 
224
  <tools>
225
- On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
226
  {%- for tool in tools.values() %}
227
  - {{ tool.name }}: {{ tool.description }}
228
  Takes inputs: {{tool.inputs}}
@@ -231,29 +344,23 @@ On top of performing computations in the Python code snippets that you create, y
231
  </tools>
232
 
233
  <click_guidelines>
234
- Look at elements on the screen to determine what to click or interact with.
235
- The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels, take it into account to decide clicking coordinates. NEVER USE HYPOTHETIC OR ASSUMED COORDINATES, USE TRUE COORDINATES that you can see from the screenshot.
236
- Use precise coordinates based on the current screenshot for mouse movements and clicks.
237
- Whenever you click, MAKE SURE to click in the middle of the button, text, link or any other clickable element. Not under, not on the side. IN THE MIDDLE, else you risk to miss it.
238
- In menus it is always better to click in the middle of the text rather than in the tiny icon. Calculate extremelly well the coordinates. A mistake here can make the full task fail.
239
- Sometimes you may have missed a click, so never assume that you're on the right page, always make sure that your previous action worked.
240
- In the screenshot you will see a green crosshair displayed over the position of your last click: this way can inspect if the mouse pointer is off of the targeted element, pay special attention to it.
241
  </click_guidelines>
242
 
243
  <general_guidelines>
244
- Always analyze the latest screenshot carefully before performing actions.
245
- You can wait for appropriate loading times using the wait() tool. But don't wait forever, sometimes you've just misclicked and the process didn't launch.
246
- Execute one action at a time: don't try to pack a click and typing in one action.
247
- On each step, look at the last screenshot and action to validate if previous steps worked and decide the next action. If you repeated an action already without effect, it means that this action is useless: don't repeat it and try something else.
248
- Use click to move through menus on the desktop and scroll for web and specific applications.
249
- Always analyze the latest screenshot carefully before performing actions.
250
- Desktop menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
251
- NEVER CLICK THE WEB BROWSER ICON TO OPEN THE WEB BROWSER: use open_url directly.
252
- In browser, ignore any sign-in popups while they don't interfere with the elements you want to interact with.
253
  </general_guidelines>
254
  """.replace("<<current_date>>", datetime.now().strftime("%A, %d-%B-%Y"))
255
 
256
-
257
  def draw_marker_on_image(image_copy, click_coordinates):
258
  x, y = click_coordinates
259
  draw = ImageDraw.Draw(image_copy)
@@ -261,385 +368,225 @@ def draw_marker_on_image(image_copy, click_coordinates):
261
  # Draw cross
262
  draw.line((x - cross_size, y, x + cross_size, y), fill="green", width=linewidth)
263
  draw.line((x, y - cross_size, x, y + cross_size), fill="green", width=linewidth)
264
- # Add a circle around it for better visibility
265
  draw.ellipse(
266
- (
267
- x - cross_size * 2,
268
- y - cross_size * 2,
269
- x + cross_size * 2,
270
- y + cross_size * 2,
271
- ),
272
  outline="green",
273
  width=linewidth,
274
  )
275
  return image_copy
276
 
277
- def get_agent_summary_erase_images(agent):
278
- for memory_step in agent.memory.steps:
279
- if hasattr(memory_step, "observations_images"):
280
- memory_step.observations_images = None
281
- if hasattr(memory_step, "task_images"):
282
- memory_step.task_images = None
283
- return agent.write_memory_to_messages()
284
-
285
-
286
- class E2BVisionAgent(CodeAgent):
287
- """Agent for e2b desktop automation with Vision capabilities"""
288
 
289
  def __init__(
290
  self,
291
  model: Model,
292
  data_dir: str,
293
- desktop: Sandbox,
294
- tools: List[tool] = None,
295
- max_steps: int = 200,
296
  verbosity_level: LogLevel = 2,
297
- planning_interval: int = None,
298
- use_v1_prompt: bool = False,
299
  **kwargs,
300
  ):
301
- self.desktop = desktop
302
  self.data_dir = data_dir
303
- self.planning_interval = planning_interval
304
- # Initialize Desktop
305
- self.width, self.height = self.desktop.get_screen_size()
306
- print(f"Screen size: {self.width}x{self.height}")
307
-
308
- # Set up temp directory
309
  os.makedirs(self.data_dir, exist_ok=True)
310
- print(f"Screenshots and steps will be saved to: {self.data_dir}")
311
 
312
- self.use_v1_prompt = use_v1_prompt
313
- # Initialize base agent
314
  super().__init__(
315
- tools=tools or [],
316
  model=model,
317
  max_steps=max_steps,
318
  verbosity_level=verbosity_level,
319
- planning_interval=self.planning_interval,
320
  **kwargs,
321
  )
322
- self.prompt_templates["system_prompt"] = E2B_SYSTEM_PROMPT_TEMPLATE.replace(
323
- "<<resolution_x>>", str(self.width)
324
- ).replace("<<resolution_y>>", str(self.height))
325
-
326
- # Add screen info to state
327
- self.state["screen_width"] = self.width
328
- self.state["screen_height"] = self.height
329
 
330
- # Add default tools
331
- self.logger.log("Setting up agent tools...")
332
- self._setup_desktop_tools()
333
  self.step_callbacks.append(self.take_screenshot_callback)
334
 
335
- def _setup_desktop_tools(self):
336
- """Register all desktop tools"""
337
 
338
  @tool
339
  def click(x: int, y: int) -> str:
340
  """
341
- Performs a left-click at the specified coordinates
342
  Args:
343
- x: The x coordinate (horizontal position)
344
- y: The y coordinate (vertical position)
345
  """
346
- self.desktop.move_mouse(x, y)
347
- self.desktop.left_click()
348
  self.click_coordinates = [x, y]
349
- self.logger.log(f"Clicked at coordinates ({x}, {y})")
350
- return f"Clicked at coordinates ({x}, {y})"
351
 
352
  @tool
353
  def right_click(x: int, y: int) -> str:
354
  """
355
- Performs a right-click at the specified coordinates
356
  Args:
357
- x: The x coordinate (horizontal position)
358
- y: The y coordinate (vertical position)
359
  """
360
- self.desktop.move_mouse(x, y)
361
- self.desktop.right_click()
362
  self.click_coordinates = [x, y]
363
- self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
364
- return f"Right-clicked at coordinates ({x}, {y})"
365
 
366
  @tool
367
  def double_click(x: int, y: int) -> str:
368
  """
369
- Performs a double-click at the specified coordinates
370
  Args:
371
- x: The x coordinate (horizontal position)
372
- y: The y coordinate (vertical position)
373
  """
374
- self.desktop.move_mouse(x, y)
375
- self.desktop.double_click()
376
  self.click_coordinates = [x, y]
377
- self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
378
- return f"Double-clicked at coordinates ({x}, {y})"
379
-
380
- @tool
381
- def move_mouse(x: int, y: int) -> str:
382
- """
383
- Moves the mouse cursor to the specified coordinates
384
- Args:
385
- x: The x coordinate (horizontal position)
386
- y: The y coordinate (vertical position)
387
- """
388
- self.desktop.move_mouse(x, y)
389
- self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
390
- return f"Moved mouse to coordinates ({x}, {y})"
391
-
392
- def normalize_text(text):
393
- return "".join(
394
- c
395
- for c in unicodedata.normalize("NFD", text)
396
- if not unicodedata.combining(c)
397
- )
398
 
399
  @tool
400
  def type_text(text: str) -> str:
401
  """
402
- Types the specified text at the current cursor position.
403
  Args:
404
- text: The text to type
405
  """
406
- clean_text = normalize_text(text)
407
- self.desktop.write(clean_text, delay_in_ms=75)
408
- self.logger.log(f"Typed text: '{clean_text}'")
409
  return f"Typed text: '{clean_text}'"
410
 
411
  @tool
412
  def press_key(key: str) -> str:
413
  """
414
- Presses a keyboard key
415
  Args:
416
- key: The key to press (e.g. "enter", "space", "backspace", etc.).
417
  """
418
- self.desktop.press(key)
419
- self.logger.log(f"Pressed key: {key}")
420
  return f"Pressed key: {key}"
421
 
422
- @tool
423
- def go_back() -> str:
424
- """
425
- Goes back to the previous page in the browser.
426
- """
427
- self.desktop.press(["alt", "left"])
428
- self.logger.log("Went back one page")
429
- return "Went back one page"
430
-
431
  @tool
432
  def drag_and_drop(x1: int, y1: int, x2: int, y2: int) -> str:
433
  """
434
- Clicks [x1, y1], drags mouse to [x2, y2], then release click.
435
  Args:
436
- x1: The x coordinate of the start position.
437
- y1: The y coordinate of the start position.
438
- x2: The x coordinate of the end position.
439
- y2: The y coordinate of the end position.
440
  """
441
- self.desktop.drag([x1, y1], [x2, y2])
442
- message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
443
- self.logger.log(message)
444
- return message
445
 
446
  @tool
447
- def scroll(x: int, y: int, direction: str = "down", amount: int = 2) -> str:
448
  """
449
- Moves the mouse to selected coordinates, then uses the scroll button.
450
  Args:
451
- x: The x coordinate
452
- y: The y coordinate
453
- direction: "up" or "down"
454
- amount: The amount to scroll.
455
  """
456
- self.desktop.move_mouse(x, y)
457
- self.desktop.scroll(direction=direction, amount=amount)
458
- message = f"Scrolled {direction} by {amount}"
459
- self.logger.log(message)
460
- return message
461
 
462
  @tool
463
  def wait(seconds: float) -> str:
464
  """
465
  Waits for the specified number of seconds.
466
  Args:
467
- seconds: The duration to wait in seconds.
468
  """
469
  time.sleep(seconds)
470
- self.logger.log(f"Waited for {seconds} seconds")
471
  return f"Waited for {seconds} seconds"
472
 
473
  @tool
474
  def open_url(url: str) -> str:
475
  """
476
- Directly opens a browser with the specified url.
477
  Args:
478
- url: The website URL to open.
479
  """
480
  if not url.startswith(("http://", "https://")):
481
  url = "https://" + url
482
-
483
- self.desktop.open(url)
484
- time.sleep(2)
485
- self.logger.log(f"Opening URL: {url}")
486
- return f"Opened URL: {url}"
 
 
487
 
488
  @tool
489
- def find_on_page_ctrl_f(search_string: str) -> str:
490
  """
491
- Scroll the browser viewport to the first occurrence of the search string (Ctrl+F).
492
- Args:
493
- search_string: The text to search for on the page.
494
  """
495
- self.desktop.press(["ctrl", "f"])
496
- time.sleep(0.3)
497
- clean_text = normalize_text(search_string)
498
- self.desktop.write(clean_text, delay_in_ms=75)
499
- time.sleep(0.3)
500
- self.desktop.press("enter")
501
- time.sleep(0.3)
502
- self.desktop.press("esc")
503
- output_message = f"Scrolled to the first occurrence of '{clean_text}'"
504
- self.logger.log(output_message)
505
- return output_message
506
-
507
- # Register the tools
508
  self.tools["click"] = click
509
  self.tools["right_click"] = right_click
510
  self.tools["double_click"] = double_click
511
- self.tools["move_mouse"] = move_mouse
512
  self.tools["type_text"] = type_text
513
  self.tools["press_key"] = press_key
 
514
  self.tools["scroll"] = scroll
515
  self.tools["wait"] = wait
516
  self.tools["open_url"] = open_url
517
  self.tools["go_back"] = go_back
518
- self.tools["drag_and_drop"] = drag_and_drop
519
- self.tools["find_on_page_ctrl_f"] = find_on_page_ctrl_f
520
 
521
- def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
522
- """Callback that takes a screenshot + memory snapshot after a step completes"""
523
- self.logger.log("Analyzing screen content...")
524
 
 
 
525
  current_step = memory_step.step_number
 
526
 
527
- time.sleep(2.5) # Let things happen on the desktop
528
- screenshot_bytes = self.desktop.screenshot(format="bytes")
529
- image = Image.open(BytesIO(screenshot_bytes))
530
 
531
- # Create a filename with step number
532
  screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
533
  image.save(screenshot_path)
534
 
535
  image_copy = image.copy()
536
-
537
  if getattr(self, "click_coordinates", None):
538
  image_copy = draw_marker_on_image(image_copy, self.click_coordinates)
539
 
540
  self.last_marked_screenshot = AgentImage(screenshot_path)
541
- print(f"Saved screenshot for step {current_step} to {screenshot_path}")
542
-
543
- # Optimization: remove previous raw images from memory to save context/speed
544
  for previous_memory_step in agent.memory.steps:
545
- if (
546
- isinstance(previous_memory_step, ActionStep)
547
- and previous_memory_step.step_number <= current_step - 1
548
- ):
549
  previous_memory_step.observations_images = None
550
  elif isinstance(previous_memory_step, TaskStep):
551
  previous_memory_step.task_images = None
552
 
553
- # Add the marker-edited image to the current memory step
554
  memory_step.observations_images = [image_copy]
555
- self.click_coordinates = None # Reset click marker
556
-
557
-
558
- # -----------------------------------------------------------------------------
559
- # SANDBOX MANAGEMENT & HELPERS
560
- # -----------------------------------------------------------------------------
561
-
562
- def upload_to_hf_and_remove(folder_path):
563
- repo_id = "smolagents/computer-agent-logs"
564
- try:
565
- folder_name = os.path.basename(os.path.normpath(folder_path))
566
- print(f"Uploading {folder_path} to {repo_id}/{folder_name}...")
567
- url = upload_folder(
568
- folder_path=folder_path,
569
- repo_id=repo_id,
570
- repo_type="dataset",
571
- path_in_repo=folder_name,
572
- ignore_patterns=[".git/*", ".gitignore"],
573
- )
574
- print(f"Upload complete. Removing local folder {folder_path}...")
575
- shutil.rmtree(folder_path)
576
- return url
577
- except Exception as e:
578
- print(f"Error during upload or cleanup: {str(e)}")
579
- # Don't raise, just log, to keep app running
580
- return None
581
 
582
- def cleanup_sandboxes():
583
- current_time = time.time()
584
- sandboxes_to_remove = []
585
 
586
- for session_id, metadata in SANDBOX_METADATA.items():
587
- if current_time - metadata["last_accessed"] > SANDBOX_TIMEOUT:
588
- sandboxes_to_remove.append(session_id)
589
-
590
- for session_id in sandboxes_to_remove:
591
- if session_id in SANDBOXES:
592
- try:
593
- data_dir = os.path.join(TMP_DIR, session_id)
594
- if os.path.exists(data_dir):
595
- shutil.rmtree(data_dir) # Just local cleanup for this demo
596
-
597
- SANDBOXES[session_id].kill()
598
- del SANDBOXES[session_id]
599
- del SANDBOX_METADATA[session_id]
600
- print(f"Cleaned up sandbox for session {session_id}")
601
- except Exception as e:
602
- print(f"Error cleaning up sandbox {session_id}: {str(e)}")
603
-
604
- def get_or_create_sandbox(session_uuid):
605
- current_time = time.time()
606
-
607
- if (
608
- session_uuid in SANDBOXES
609
- and session_uuid in SANDBOX_METADATA
610
- and current_time - SANDBOX_METADATA[session_uuid]["created_at"]
611
- < SANDBOX_TIMEOUT
612
- ):
613
- print(f"Reusing Sandbox for {session_uuid}")
614
- SANDBOX_METADATA[session_uuid]["last_accessed"] = current_time
615
- return SANDBOXES[session_uuid]
616
- else:
617
- print("No sandbox found, creating a new one")
618
-
619
- if session_uuid in SANDBOXES:
620
- try:
621
- SANDBOXES[session_uuid].kill()
622
- except Exception:
623
- pass
624
-
625
- print(f"Creating new sandbox for session {session_uuid}")
626
- desktop = Sandbox(
627
- api_key=E2B_API_KEY,
628
- resolution=(WIDTH, HEIGHT),
629
- dpi=96,
630
- timeout=SANDBOX_TIMEOUT,
631
- template="k0wmnzir0zuzye6dndlw",
632
  )
633
- desktop.stream.start(require_auth=True)
634
- setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
635
- desktop.commands.run(setup_cmd)
636
 
637
- SANDBOXES[session_uuid] = desktop
638
- SANDBOX_METADATA[session_uuid] = {
639
- "created_at": current_time,
640
- "last_accessed": current_time,
641
- }
642
- return desktop
 
 
 
 
643
 
644
  def save_final_status(folder, status: str, summary, error_message=None) -> None:
645
  try:
@@ -653,142 +600,24 @@ def save_final_status(folder, status: str, summary, error_message=None) -> None:
653
  except Exception as e:
654
  print(f"Failed to save metadata: {e}")
655
 
656
- def create_agent(data_dir, desktop):
657
- # Initialize the wrapper that calls the global GPU function
658
- model = FaraLocalModel()
659
-
660
- return E2BVisionAgent(
661
- model=model,
662
- data_dir=data_dir,
663
- desktop=desktop,
664
- max_steps=200,
665
- verbosity_level=2,
666
- use_v1_prompt=True,
667
- )
668
-
669
- def generate_interaction_id(session_uuid):
670
- return f"{session_uuid}_{int(time.time())}"
671
-
672
  # -----------------------------------------------------------------------------
673
- # GRADIO UI & INTERACTION
674
  # -----------------------------------------------------------------------------
675
 
 
676
  custom_css = """
677
  .modal-container { margin: var(--size-16) auto!important; }
678
- .sandbox-container { position: relative; width: 910px; height: 800px; overflow: hidden; margin: auto; }
679
- .sandbox-frame { display: none; position: absolute; top: 0; left: 0; width: 910px; height: 800px; pointer-events:none; }
680
- .sandbox-iframe, .bsod-image { position: absolute; width: <<WIDTH>>px; height: <<HEIGHT>>px; border: 4px solid #444444; transform-origin: 0 0; }
681
- .primary-color-label label span { font-weight: bold; color: var(--color-accent); }
682
- .status-bar { display: flex; flex-direction: row; align-items: center; z-index: 100; }
683
- .status-indicator { width: 15px; height: 15px; border-radius: 50%; }
684
- .status-text { font-size: 16px; font-weight: bold; padding-left: 8px; text-shadow: none; }
685
- .status-interactive { background-color: #2ecc71; animation: blink 2s infinite; }
686
- .status-view-only { background-color: #e74c3c; }
687
- .status-error { background-color: #e74c3c; animation: blink-error 1s infinite; }
688
- @keyframes blink-error { 0% { background-color: rgba(231, 76, 60, 1); } 50% { background-color: rgba(231, 76, 60, 0.4); } 100% { background-color: rgba(231, 76, 60, 1); } }
689
- @keyframes blink { 0% { background-color: rgba(46, 204, 113, 1); } 50% { background-color: rgba(46, 204, 113, 0.4); } 100% { background-color: rgba(46, 204, 113, 1); } }
690
- #chatbot { height:1000px!important; }
691
- #chatbot .role { max-width:95% }
692
- .logo-container { display: flex; flex-direction: column; align-items: flex-start; gap: 5px; }
693
- .logo-item { display: flex; align-items: center; padding: 0 30px; gap: 10px; text-decoration: none!important; color: #f59e0b; font-size:17px; }
694
- """.replace("<<WIDTH>>", str(WIDTH + 15)).replace("<<HEIGHT>>", str(HEIGHT + 10))
695
-
696
- sandbox_html_template = """
697
- <style>
698
- @import url('https://fonts.googleapis.com/css2?family=Oxanium:wght@200..800&display=swap');
699
- </style>
700
- <h1 style="color:var(--color-accent);margin:0;">Fara CUA - <i>Powered by <a href="https://github.com/huggingface/smolagents">smolagents</a></i><h1>
701
- <div class="sandbox-container" style="margin:0;">
702
- <div class="status-bar">
703
- <div class="status-indicator {status_class}"></div>
704
- <div class="status-text">{status_text}</div>
705
- </div>
706
- <iframe id="sandbox-iframe"
707
- src="{stream_url}"
708
- class="sandbox-iframe"
709
- style="display: block;"
710
- allowfullscreen>
711
- </iframe>
712
- <img src="https://huggingface.co/datasets/mfarre/servedfiles/resolve/main/blue_screen_of_death.gif" class="bsod-image" style="display: none;"/>
713
- <img src="https://huggingface.co/datasets/m-ric/images/resolve/main/HUD_thom.png" class="sandbox-frame" />
714
  </div>
715
- """.replace("<<WIDTH>>", str(WIDTH + 15)).replace("<<HEIGHT>>", str(HEIGHT + 10))
716
-
717
- custom_js = """function() {
718
- document.body.classList.add('dark');
719
- // Function to check if sandbox is timing out
720
- const checkSandboxTimeout = function() {
721
- const timeElement = document.getElementById('sandbox-creation-time');
722
- if (timeElement) {
723
- const creationTime = parseFloat(timeElement.getAttribute('data-time'));
724
- const timeoutValue = parseFloat(timeElement.getAttribute('data-timeout'));
725
- const currentTime = Math.floor(Date.now() / 1000);
726
- if (currentTime - creationTime >= timeoutValue) {
727
- showBSOD('Error');
728
- return;
729
- }
730
- }
731
- setTimeout(checkSandboxTimeout, 5000);
732
- };
733
- const showBSOD = function(statusText = 'Error') {
734
- const iframe = document.getElementById('sandbox-iframe');
735
- const bsod = document.getElementById('bsod-image');
736
- if (iframe && bsod) {
737
- iframe.style.display = 'none';
738
- bsod.style.display = 'block';
739
- document.querySelector('.status-indicator').className = 'status-indicator status-error';
740
- document.querySelector('.status-text').innerText = statusText;
741
- }
742
- };
743
- const resetBSOD = function() {
744
- const iframe = document.getElementById('sandbox-iframe');
745
- const bsod = document.getElementById('bsod-image');
746
- if (iframe && bsod && bsod.style.display === 'block') {
747
- iframe.style.display = 'block';
748
- bsod.style.display = 'none';
749
- }
750
- };
751
- document.addEventListener('click', function(e) {
752
- if (e.target.tagName === 'BUTTON' && e.target.innerText === "Let's go!") {
753
- resetBSOD();
754
- }
755
- });
756
- checkSandboxTimeout();
757
- const params = new URLSearchParams(window.location.search);
758
- if (!params.has('__theme')) {
759
- params.set('__theme', 'dark');
760
- window.location.search = params.toString();
761
- }
762
- }"""
763
-
764
- def update_html(interactive_mode: bool, session_uuid):
765
- desktop = get_or_create_sandbox(session_uuid)
766
- auth_key = desktop.stream.get_auth_key()
767
- base_url = desktop.stream.get_url(auth_key=auth_key)
768
- stream_url = base_url if interactive_mode else f"{base_url}&view_only=true"
769
-
770
- status_class = "status-interactive" if interactive_mode else "status-view-only"
771
- status_text = "Interactive" if interactive_mode else "Agent running..."
772
- creation_time = (
773
- SANDBOX_METADATA[session_uuid]["created_at"]
774
- if session_uuid in SANDBOX_METADATA
775
- else time.time()
776
- )
777
-
778
- sandbox_html_content = sandbox_html_template.format(
779
- stream_url=stream_url,
780
- status_class=status_class,
781
- status_text=status_text,
782
- )
783
- sandbox_html_content += f'<div id="sandbox-creation-time" style="display:none;" data-time="{creation_time}" data-timeout="{SANDBOX_TIMEOUT}"></div>'
784
- return sandbox_html_content
785
-
786
- def initialize_session(interactive_mode, browser_uuid):
787
- if not browser_uuid:
788
- new_uuid = str(uuid.uuid4())
789
- return update_html(interactive_mode, new_uuid), new_uuid
790
- else:
791
- return update_html(interactive_mode, browser_uuid), browser_uuid
792
 
793
  class EnrichedGradioUI(GradioUI):
794
  def interact_with_agent(
@@ -801,35 +630,30 @@ class EnrichedGradioUI(GradioUI):
801
  request: gr.Request,
802
  ):
803
  interaction_id = generate_interaction_id(session_uuid)
804
- desktop = get_or_create_sandbox(session_uuid)
805
-
806
  data_dir = os.path.join(TMP_DIR, interaction_id)
807
- if not os.path.exists(data_dir):
808
- os.makedirs(data_dir)
809
-
810
- # Create fresh agent.
811
- # Note: We do NOT store the full agent in session_state passed between Gradio events
812
- # if possible, or if we do, we ensure this function isn't wrapped in @spaces.GPU
813
- agent = create_agent(data_dir=data_dir, desktop=desktop)
814
- session_state["agent"] = agent # Storing in state is fine if this function runs on CPU
815
 
816
  try:
817
  stored_messages.append(gr.ChatMessage(role="user", content=task_input))
818
- yield stored_messages
819
-
820
- screenshot_bytes = agent.desktop.screenshot(format="bytes")
821
- initial_screenshot = Image.open(BytesIO(screenshot_bytes))
822
 
 
 
 
 
823
  for msg in stream_to_gradio(
824
  agent,
825
  task=task_input,
826
- task_images=[initial_screenshot],
827
  reset_agent_memory=False,
828
  ):
829
- if (
830
- hasattr(agent, "last_marked_screenshot")
831
- and msg.content == "-----"
832
- ):
833
  stored_messages.append(
834
  gr.ChatMessage(
835
  role="assistant",
@@ -839,13 +663,17 @@ class EnrichedGradioUI(GradioUI):
839
  },
840
  )
841
  )
842
- stored_messages.append(msg)
843
- yield stored_messages
 
 
 
844
 
845
  if consent_storage:
846
  summary = get_agent_summary_erase_images(agent)
847
  save_final_status(data_dir, "completed", summary=summary)
848
- yield stored_messages
 
849
 
850
  except Exception as e:
851
  error_message = f"Error in interaction: {str(e)}"
@@ -853,121 +681,87 @@ class EnrichedGradioUI(GradioUI):
853
  stored_messages.append(
854
  gr.ChatMessage(role="assistant", content="Run failed:\n" + error_message)
855
  )
856
- if consent_storage:
857
- save_final_status(data_dir, "failed", summary=None, error_message=error_message)
858
- yield stored_messages
859
-
860
- # -----------------------------------------------------------------------------
861
- # MAIN APP CONSTRUCTION
862
- # -----------------------------------------------------------------------------
863
 
 
864
  theme = gr.themes.Default(
865
  font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue"
866
  )
867
 
868
- with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
869
- session_uuid_state = gr.State(None)
 
 
870
 
871
  with gr.Row():
872
- sandbox_html = gr.HTML(
873
- value=sandbox_html_template.format(
874
- stream_url="",
875
- status_class="status-interactive",
876
- status_text="Interactive",
877
- ),
878
- label="Output",
879
- )
880
- with gr.Sidebar(position="left"):
881
- with Modal(visible=True) as modal:
882
- gr.Markdown("""### Welcome to Fara CUA Demo 🖥️
883
- This agent uses **microsoft/Fara-7B** (running locally via ZeroGPU) and **smolagents** to control a remote computer.
884
-
885
- 👉 Type a task, click 'Let's go!', and watch the agent work.
886
- """)
887
  task_input = gr.Textbox(
888
- value="Find me pictures of cute puppies",
889
- label="Enter your task below:",
890
- elem_classes="primary-color-label",
891
  )
892
-
893
- run_btn = gr.Button("Let's go!", variant="primary")
894
 
895
- # Simple controls
896
- stop_btn = gr.Button("Stop the agent!", variant="secondary")
897
- consent_storage = gr.Checkbox(label="Store logs locally?", value=True)
898
 
899
  gr.Examples(
900
  examples=[
901
- "Use Google Maps to find the Hugging Face HQ in Paris",
902
- "Go to Wikipedia and find what happened on April 4th",
903
- "Find out the travel time by train from Bern to Basel on Google Maps",
904
  ],
905
- inputs=task_input,
906
  )
907
 
908
- session_state = gr.State({})
909
- stored_messages = gr.State([])
910
-
911
- chatbot_display = gr.Chatbot(
912
- elem_id="chatbot",
913
- label="Agent's execution logs",
914
- type="messages",
915
- avatar_images=(None, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png"),
916
- resizable=True,
917
- )
 
 
 
 
 
 
 
 
 
918
 
919
- # Dummy agent init for UI wrapper (actual agent created in interaction loop)
920
- # We pass a dummy CodeAgent just to initialize the UI class
921
  agent_ui = EnrichedGradioUI(CodeAgent(tools=[], model=Model(), name="init"))
922
 
923
- is_interactive = gr.Checkbox(value=True, visible=False)
924
-
925
- def clear_and_set_view_only(task_input, session_uuid):
926
- return update_html(False, session_uuid)
927
-
928
- def set_interactive(session_uuid):
929
- return update_html(True, session_uuid)
930
-
931
  def interrupt_agent(session_state):
932
- if "agent" in session_state and hasattr(session_state["agent"], "interrupt_switch") and not session_state["agent"].interrupt_switch:
933
- session_state["agent"].interrupt()
934
- return "Stopped"
935
- return "Stop"
936
-
937
- # Event Wiring
938
- run_event = (
939
- run_btn.click(
940
- fn=clear_and_set_view_only,
941
- inputs=[task_input, session_uuid_state],
942
- outputs=[sandbox_html],
943
- )
944
- .then(
945
- agent_ui.interact_with_agent,
946
- inputs=[
947
- task_input,
948
- stored_messages,
949
- session_state,
950
- session_uuid_state,
951
- consent_storage,
952
- ],
953
- outputs=[chatbot_display],
954
- )
955
- .then(fn=set_interactive, inputs=[session_uuid_state], outputs=[sandbox_html])
956
  )
957
 
958
  stop_btn.click(fn=interrupt_agent, inputs=[session_state], outputs=[])
959
 
960
- # Initialization on load
961
- demo.load(
962
- fn=lambda: True,
963
- outputs=[is_interactive],
964
- ).then(
965
- fn=initialize_session,
966
- js="() => localStorage.getItem('gradio-session-uuid') || (() => { const id = self.crypto.randomUUID(); localStorage.setItem('gradio-session-uuid', id); return id })()",
967
- inputs=[is_interactive],
968
- outputs=[sandbox_html, session_uuid_state],
969
- )
970
-
971
  if __name__ == "__main__":
972
- Timer(60, cleanup_sandboxes).start()
973
  demo.launch()
 
3
  import shutil
4
  import time
5
  import uuid
6
+ import tempfile
7
+ import atexit
8
  import unicodedata
9
  from io import BytesIO
10
  from threading import Timer
 
15
  import torch
16
  import spaces
17
  from dotenv import load_dotenv
 
 
 
18
  from PIL import Image, ImageDraw
19
 
20
+ # Selenium Imports
21
+ from selenium import webdriver
22
+ from selenium.webdriver.chrome.service import Service as ChromeService
23
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
24
+ from selenium.webdriver.common.action_chains import ActionChains
25
+ from selenium.webdriver.common.by import By
26
+ from selenium.webdriver.common.keys import Keys
27
+ from webdriver_manager.chrome import ChromeDriverManager
28
+
29
  # Smolagents imports
30
  from smolagents import CodeAgent, tool, AgentImage
31
  from smolagents.memory import ActionStep, TaskStep
 
46
  # CONFIGURATION & CONSTANTS
47
  # -----------------------------------------------------------------------------
48
 
 
49
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
50
  if HF_TOKEN:
51
+ from huggingface_hub import login
52
  login(token=HF_TOKEN)
53
 
54
+ # Browser Sandbox Config
 
 
55
  WIDTH = 1024
56
  HEIGHT = 768
57
  TMP_DIR = "./tmp/"
 
64
 
65
  print("Loading Fara Model... This may take a moment.")
66
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
67
  MODEL_ID_F = "microsoft/Fara-7B"
68
 
69
  # Global model variables
 
81
  print(f"Fara Model loaded successfully on {DEVICE}")
82
  except Exception as e:
83
  print(f"Error loading Fara Model: {e}")
84
+ print("Falling back to Qwen/Qwen2.5-VL-7B-Instruct...")
85
  try:
86
  MODEL_ID_F = "Qwen/Qwen2.5-VL-7B-Instruct"
87
  processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
 
103
  @spaces.GPU(duration=120)
104
  def run_model_inference(formatted_messages, max_tokens=1024, stop_sequences=None):
105
  """
106
+ Runs inference on the GPU worker.
 
107
  """
108
  global model_f, processor_f
109
 
110
  if model_f is None:
111
  raise ValueError("Model is not loaded.")
112
 
 
113
  text = processor_f.apply_chat_template(
114
  formatted_messages, tokenize=False, add_generation_prompt=True
115
  )
 
124
  return_tensors="pt",
125
  )
126
 
 
127
  inputs = inputs.to(model_f.device)
128
 
 
129
  with torch.no_grad():
130
  generated_ids = model_f.generate(
131
  **inputs,
 
134
  tokenizer=processor_f.tokenizer,
135
  )
136
 
 
137
  generated_ids_trimmed = [
138
  out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
139
  ]
 
145
 
146
 
147
  class FaraLocalModel(Model):
 
 
 
148
  def __init__(self, **kwargs):
149
  super().__init__(**kwargs)
150
 
 
156
  ) -> ChatMessage:
157
 
158
  formatted_messages = []
 
 
 
159
  for msg in messages:
160
  role = msg["role"]
161
  content = msg["content"]
 
162
  new_content = []
163
 
164
  if isinstance(content, str):
 
170
  elif isinstance(item, dict):
171
  if "type" in item:
172
  if item["type"] == "image":
 
173
  val = item.get("image") or item.get("url") or item.get("path")
174
  new_content.append({"type": "image", "image": val})
175
  else:
 
177
 
178
  formatted_messages.append({"role": role, "content": new_content})
179
 
 
 
 
180
  output_text = run_model_inference(
181
  formatted_messages=formatted_messages,
182
  max_tokens=kwargs.get("max_tokens", 1024),
 
189
  )
190
 
191
  # -----------------------------------------------------------------------------
192
+ # SELENIUM CHROME SANDBOX
193
  # -----------------------------------------------------------------------------
194
 
195
class SeleniumSandbox:
    """Headless Chrome session driven through Selenium.

    Each instance owns one Chrome process and one throw-away user-data
    directory. Coordinates accepted by the mouse helpers are viewport pixels,
    i.e. the same coordinate system as the screenshots returned by
    ``get_screenshot``.

    Args:
        width: Requested viewport width in pixels.
        height: Requested viewport height in pixels.

    Raises:
        Exception: Whatever Selenium / webdriver-manager raised while starting
            Chrome; the temporary profile directory is removed before
            re-raising.
    """

    def __init__(self, width=1024, height=768):
        self.width = width
        self.height = height
        self.tmp_dir = tempfile.mkdtemp(prefix="chrome_sandbox_")

        # Setup Chrome Options
        chrome_opts = ChromeOptions()
        chrome_opts.add_argument("--headless=new")  # Run headless for Space compatibility
        chrome_opts.add_argument(f"--user-data-dir={self.tmp_dir}")
        chrome_opts.add_argument(f"--window-size={width},{height}")
        chrome_opts.add_argument("--no-sandbox")
        chrome_opts.add_argument("--disable-dev-shm-usage")
        chrome_opts.add_argument("--disable-extensions")
        chrome_opts.add_argument("--disable-gpu")
        chrome_opts.add_argument("--force-device-scale-factor=1")
        chrome_opts.add_argument("--hide-scrollbars")

        # Initialize Driver
        try:
            self.driver = webdriver.Chrome(
                service=ChromeService(ChromeDriverManager().install()),
                options=chrome_opts,
            )
            self.driver.set_window_size(width, height)

            # Start with a blank page
            self.driver.get("about:blank")

            # set_window_size controls the outer window; compensate for any
            # browser UI overhead so screenshots match width x height.
            self._fit_viewport()
            print(f"Selenium Chrome Driver started. Data dir: {self.tmp_dir}")
        except Exception as e:
            print(f"Failed to initialize Selenium: {e}")
            self.cleanup()
            raise

    def _fit_viewport(self):
        """Best-effort: grow/shrink the window until the viewport is width x height."""
        try:
            inner_w, inner_h = self.driver.execute_script(
                "return [window.innerWidth, window.innerHeight];"
            )
            if not inner_w or not inner_h:
                return
            if (inner_w, inner_h) != (self.width, self.height):
                self.driver.set_window_size(
                    2 * self.width - inner_w, 2 * self.height - inner_h
                )
        except Exception as e:
            # Sizing is cosmetic; never fail start-up because of it.
            print(f"Could not adjust viewport size: {e}")

    @staticmethod
    def _move_pointer(actions, x, y):
        """Queue an absolute pointer move to viewport position (x, y).

        ``move_to_element_with_offset`` measures from the element's in-view
        centre in Selenium 4, so it cannot be used as a (0, 0) anchor; the
        W3C pointer action with a viewport origin is absolute.
        """
        actions.w3c_actions.pointer_action.move_to_location(int(x), int(y))
        # Keep the key input source in step with the pointer source, as the
        # ActionChains helpers themselves do.
        actions.w3c_actions.key_action.pause()

    def get_screenshot(self):
        """Returns screenshot as PIL Image"""
        png_data = self.driver.get_screenshot_as_png()
        return Image.open(BytesIO(png_data))

    def move_mouse_and_click(self, x, y, click_type="left"):
        """Move the mouse to viewport position (x, y) and click.

        Args:
            x: Horizontal viewport coordinate in pixels.
            y: Vertical viewport coordinate in pixels.
            click_type: "left", "right" or "double"; any other value only
                moves the pointer.

        Errors are printed and swallowed (best-effort).
        """
        try:
            actions = ActionChains(self.driver)
            self._move_pointer(actions, x, y)

            if click_type == "left":
                actions.click()
            elif click_type == "right":
                actions.context_click()
            elif click_type == "double":
                actions.double_click()

            actions.perform()
        except Exception as e:
            print(f"Error in move_mouse_and_click: {e}")

    def drag_and_drop(self, x1, y1, x2, y2):
        """Press at (x1, y1), drag to (x2, y2) and release (viewport pixels).

        Errors are printed and swallowed (best-effort).
        """
        try:
            actions = ActionChains(self.driver)
            self._move_pointer(actions, x1, y1)
            actions.click_and_hold()
            self._move_pointer(actions, x2, y2)
            actions.release()
            actions.perform()
        except Exception as e:
            print(f"Error in drag_and_drop: {e}")

    def type_text(self, text):
        """Send ``text`` as keystrokes to whatever currently has focus."""
        actions = ActionChains(self.driver)
        actions.send_keys(text)
        actions.perform()

    def press_key(self, key_name):
        """Press a single key given by name (e.g. 'enter', 'esc', 'Page Down').

        The name is first looked up on ``selenium...Keys`` (case-insensitive,
        spaces/hyphens treated as underscores), then in a small alias table;
        anything else is sent literally. Errors are printed and swallowed.
        """
        aliases = {
            "enter": Keys.ENTER,
            "space": Keys.SPACE,
            "backspace": Keys.BACK_SPACE,
            "esc": Keys.ESCAPE,
        }
        try:
            attr_name = key_name.strip().upper().replace(" ", "_").replace("-", "_")
            k = getattr(Keys, attr_name, None)
            if not isinstance(k, str) or not k:
                # Handle common overrides, else fall back to the literal text
                k = aliases.get(key_name.lower(), key_name)

            actions = ActionChains(self.driver)
            actions.send_keys(k)
            actions.perform()
        except Exception as e:
            print(f"Error pressing key: {e}")

    def scroll(self, amount, direction="down"):
        """Scroll the page vertically by ``amount`` notches.

        Args:
            amount: Number of notches; one notch is 100 CSS pixels.
            direction: "up" scrolls up; any other value scrolls down.

        Errors are printed and swallowed (best-effort).
        """
        # Selenium doesn't have a great scroll wheel primitive, use JS
        try:
            scroll_y = amount * 100  # Arbitrary multiplier to match "notches"
            if direction == "up":
                scroll_y = -scroll_y
            # Pass the value as an argument instead of splicing it into JS source.
            self.driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y)
        except Exception as e:
            print(f"Error scrolling: {e}")

    def cleanup(self):
        """Quit Chrome (if it was started) and delete the temporary profile dir."""
        driver = getattr(self, "driver", None)
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                # Shutdown is best-effort, but do not hide the reason.
                print(f"Error quitting Selenium driver: {e}")
        shutil.rmtree(self.tmp_dir, ignore_errors=True)
314
+
315
+ # -----------------------------------------------------------------------------
316
+ # AGENT SETUP
317
+ # -----------------------------------------------------------------------------
318
+
319
+ SYSTEM_PROMPT_TEMPLATE = """You are a browser automation assistant controlling a Google Chrome web browser. The current date is <<current_date>>.
320
 
321
  <action process>
322
  You will be given a task to solve in several steps. At each step you will perform an action.
323
+ After each action, you'll receive an updated screenshot of the browser.
324
  Then you will proceed as follows, with these sections: don't skip any!
325
 
326
  Short term goal: ...
 
335
  </action_process>
336
 
337
  <tools>
338
+ On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the browser:
339
  {%- for tool in tools.values() %}
340
  - {{ tool.name }}: {{ tool.description }}
341
  Takes inputs: {{tool.inputs}}
 
344
  </tools>
345
 
346
  <click_guidelines>
347
+ The browser has a resolution of <<resolution_x>>x<<resolution_y>> pixels.
348
+ NEVER USE HYPOTHETIC OR ASSUMED COORDINATES, USE TRUE COORDINATES that you can see from the screenshot.
349
+ Use precise coordinates based on the current screenshot.
350
+ Whenever you click, MAKE SURE to click in the middle of the button, text, link or any other clickable element.
351
+ In the screenshot you will see a green crosshair displayed over the position of your last click.
 
 
352
  </click_guidelines>
353
 
354
  <general_guidelines>
355
+ Execute one action at a time.
356
+ Use `open_url` to navigate to websites.
357
+ Use `click` to navigate links and interface elements.
358
+ Use `type_text` to input into forms.
359
+ Use `scroll` to see more content.
360
+ If you get stuck, try using `open_url` to search on Google.
 
 
 
361
  </general_guidelines>
362
  """.replace("<<current_date>>", datetime.now().strftime("%A, %d-%B-%Y"))
363
 
 
364
  def draw_marker_on_image(image_copy, click_coordinates):
365
  x, y = click_coordinates
366
  draw = ImageDraw.Draw(image_copy)
 
368
  # Draw cross
369
  draw.line((x - cross_size, y, x + cross_size, y), fill="green", width=linewidth)
370
  draw.line((x, y - cross_size, x, y + cross_size), fill="green", width=linewidth)
 
371
  draw.ellipse(
372
+ (x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2),
 
 
 
 
 
373
  outline="green",
374
  width=linewidth,
375
  )
376
  return image_copy
377
 
378
class SeleniumVisionAgent(CodeAgent):
    """Agent for Browser automation with Selenium and Vision

    Wraps a ``SeleniumSandbox`` and exposes its mouse/keyboard/navigation
    primitives as smolagents tools. After every step a screenshot is taken,
    written to ``data_dir`` and attached to the step as an observation image.

    Args:
        model: Model used by the agent.
        data_dir: Directory where per-step screenshots are written (created
            if missing).
        sandbox: Browser sandbox the tools act on.
        max_steps: Forwarded to ``CodeAgent``.
        verbosity_level: Forwarded to ``CodeAgent``.
        **kwargs: Forwarded to ``CodeAgent``.
    """

    def __init__(
        self,
        model: Model,
        data_dir: str,
        sandbox: SeleniumSandbox,
        max_steps: int = 20,
        verbosity_level: LogLevel = 2,
        **kwargs,
    ):
        self.sandbox = sandbox
        self.data_dir = data_dir

        # Initialize
        print(f"Browser size: {self.sandbox.width}x{self.sandbox.height}")
        os.makedirs(self.data_dir, exist_ok=True)

        super().__init__(
            tools=[],
            model=model,
            max_steps=max_steps,
            verbosity_level=verbosity_level,
            **kwargs,
        )

        self.prompt_templates["system_prompt"] = SYSTEM_PROMPT_TEMPLATE.replace(
            "<<resolution_x>>", str(self.sandbox.width)
        ).replace("<<resolution_y>>", str(self.sandbox.height))

        self.register_tools()
        self.step_callbacks.append(self.take_screenshot_callback)

    def register_tools(self):
        """Define the browser tools as closures over ``self`` and register them.

        The tool docstrings are consumed by ``@tool`` to build the tool
        descriptions, so their wording is part of the prompt.
        """

        @tool
        def click(x: int, y: int) -> str:
            """
            Performs a left-click at the specified coordinates.
            Args:
                x: The x coordinate (horizontal position).
                y: The y coordinate (vertical position).
            """
            self.sandbox.move_mouse_and_click(x, y, "left")
            self.click_coordinates = [x, y]
            return f"Clicked at ({x}, {y})"

        @tool
        def right_click(x: int, y: int) -> str:
            """
            Performs a right-click at the specified coordinates.
            Args:
                x: The x coordinate.
                y: The y coordinate.
            """
            self.sandbox.move_mouse_and_click(x, y, "right")
            self.click_coordinates = [x, y]
            return f"Right-clicked at ({x}, {y})"

        @tool
        def double_click(x: int, y: int) -> str:
            """
            Performs a double-click at the specified coordinates.
            Args:
                x: The x coordinate.
                y: The y coordinate.
            """
            self.sandbox.move_mouse_and_click(x, y, "double")
            self.click_coordinates = [x, y]
            return f"Double-clicked at ({x}, {y})"

        @tool
        def type_text(text: str) -> str:
            """
            Types the specified text.
            Args:
                text: The text to type.
            """
            # NFC keeps accented characters as single code points; NFD would
            # send a base letter followed by a combining mark to the page.
            clean_text = unicodedata.normalize("NFC", text)
            self.sandbox.type_text(clean_text)
            return f"Typed text: '{clean_text}'"

        @tool
        def press_key(key: str) -> str:
            """
            Presses a keyboard key (e.g., 'enter', 'backspace', 'esc').
            Args:
                key: The key name.
            """
            self.sandbox.press_key(key)
            return f"Pressed key: {key}"

        @tool
        def drag_and_drop(x1: int, y1: int, x2: int, y2: int) -> str:
            """
            Drags from (x1, y1) and drops at (x2, y2).
            Args:
                x1: Start x coordinate.
                y1: Start y coordinate.
                x2: End x coordinate.
                y2: End y coordinate.
            """
            self.sandbox.drag_and_drop(x1, y1, x2, y2)
            return f"Dragged from [{x1}, {y1}] to [{x2}, {y2}]"

        @tool
        def scroll(amount: int, direction: str = "down") -> str:
            """
            Scrolls the page.
            Args:
                amount: The amount to scroll (1-10).
                direction: "up" or "down".
            """
            self.sandbox.scroll(amount, direction)
            return f"Scrolled {direction} by {amount}"

        @tool
        def wait(seconds: float) -> str:
            """
            Waits for the specified number of seconds.
            Args:
                seconds: The duration to wait.
            """
            time.sleep(seconds)
            return f"Waited for {seconds} seconds"

        @tool
        def open_url(url: str) -> str:
            """
            Navigates the browser to the specified URL.
            Args:
                url: The URL to open.
            """
            if not url.startswith(("http://", "https://")):
                url = "https://" + url
            try:
                self.sandbox.driver.get(url)
                time.sleep(2)
                title = self.sandbox.driver.title
                return f"Opened URL: {url}. Page Title: {title}"
            except Exception as e:
                return f"Failed to open URL: {e}"

        @tool
        def go_back() -> str:
            """
            Goes back to the previous page in history.
            """
            self.sandbox.driver.back()
            return "Went back one page"

        self.tools["click"] = click
        self.tools["right_click"] = right_click
        self.tools["double_click"] = double_click
        self.tools["type_text"] = type_text
        self.tools["press_key"] = press_key
        self.tools["drag_and_drop"] = drag_and_drop
        self.tools["scroll"] = scroll
        self.tools["wait"] = wait
        self.tools["open_url"] = open_url
        self.tools["go_back"] = go_back

    def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
        """Takes a screenshot and saves it to memory

        Args:
            memory_step: The step that just finished; receives the screenshot
                (with a marker at the last click, if any) as its observation.
            agent: Agent whose memory is pruned of older images. Defaults to
                this agent when omitted.
        """
        # The signature allows agent=None, so fall back to this instance
        # instead of failing on ``None.memory``.
        if agent is None:
            agent = self

        current_step = memory_step.step_number
        time.sleep(1.0)  # Wait for renders

        image = self.sandbox.get_screenshot()

        # Save to disk
        screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
        image.save(screenshot_path)

        image_copy = image.copy()
        if getattr(self, "click_coordinates", None):
            image_copy = draw_marker_on_image(image_copy, self.click_coordinates)

        # NOTE(review): despite the name, this points at the file saved above,
        # which is written before the marker is drawn on the copy.
        self.last_marked_screenshot = AgentImage(screenshot_path)

        # Cleanup old images in memory to save RAM
        for previous_memory_step in agent.memory.steps:
            if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 1:
                previous_memory_step.observations_images = None
            elif isinstance(previous_memory_step, TaskStep):
                previous_memory_step.task_images = None

        memory_step.observations_images = [image_copy]
        # The marker applies to one screenshot only.
        self.click_coordinates = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568
 
 
 
 
569
 
570
def create_agent(data_dir, sandbox):
    """Build a SeleniumVisionAgent backed by a fresh FaraLocalModel.

    Args:
        data_dir: Directory handed to the agent as its ``data_dir``.
        sandbox: Browser sandbox handed to the agent as its ``sandbox``.

    Returns:
        The constructed agent, configured with 30 max steps and verbosity 2.
    """
    agent_settings = {
        "model": FaraLocalModel(),
        "data_dir": data_dir,
        "sandbox": sandbox,
        "max_steps": 30,
        "verbosity_level": 2,
    }
    return SeleniumVisionAgent(**agent_settings)
 
 
 
579
 
580
def generate_interaction_id(session_uuid):
    """Return ``"<session_uuid>_<unix seconds>"`` for the current time."""
    timestamp = int(time.time())
    return "{}_{}".format(session_uuid, timestamp)
582
+
583
def get_agent_summary_erase_images(agent):
    """Drop every stored image from the agent's memory, then return its messages.

    For each step in ``agent.memory.steps``, the attributes
    ``observations_images`` and ``task_images`` are set to ``None`` when the
    step has them. The return value is ``agent.write_memory_to_messages()``.
    """
    image_attrs = ("observations_images", "task_images")
    for step in agent.memory.steps:
        for attr_name in image_attrs:
            if hasattr(step, attr_name):
                setattr(step, attr_name, None)
    return agent.write_memory_to_messages()
590
 
591
  def save_final_status(folder, status: str, summary, error_message=None) -> None:
592
  try:
 
600
  except Exception as e:
601
  print(f"Failed to save metadata: {e}")
602
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
603
  # -----------------------------------------------------------------------------
604
+ # UI & APP
605
  # -----------------------------------------------------------------------------
606
 
607
+ # CSS
608
  custom_css = """
609
  .modal-container { margin: var(--size-16) auto!important; }
610
+ .browser-container { position: relative; width: 100%; height: 600px; border: 1px solid #444; background: #222; display: flex; align-items: center; justify-content: center; overflow: hidden; }
611
+ .browser-image { max-width: 100%; max-height: 100%; object-fit: contain; }
612
+ #chatbot { height: 800px!important; }
613
+ """
614
+
615
+ # HTML Template for the output area (Replaced IFrame with simple image holder logic handled by Gradio Image)
616
+ browser_html = """
617
+ <div class="browser-container">
618
+ <p style="color: #888;">Browser Screenshot will appear here after steps.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
619
  </div>
620
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
 
622
  class EnrichedGradioUI(GradioUI):
623
  def interact_with_agent(
 
630
  request: gr.Request,
631
  ):
632
  interaction_id = generate_interaction_id(session_uuid)
 
 
633
  data_dir = os.path.join(TMP_DIR, interaction_id)
634
+
635
+ # Initialize Sandbox per run (clean slate)
636
+ sandbox = SeleniumSandbox(width=WIDTH, height=HEIGHT)
637
+ agent = create_agent(data_dir=data_dir, sandbox=sandbox)
638
+ session_state["agent"] = agent
 
 
 
639
 
640
  try:
641
  stored_messages.append(gr.ChatMessage(role="user", content=task_input))
642
+ yield stored_messages, None
 
 
 
643
 
644
+ # Initial screenshot
645
+ screenshot = sandbox.get_screenshot()
646
+
647
+ # Run Agent
648
  for msg in stream_to_gradio(
649
  agent,
650
  task=task_input,
651
+ task_images=[screenshot],
652
  reset_agent_memory=False,
653
  ):
654
+ # Update Chat
655
+ if hasattr(agent, "last_marked_screenshot") and msg.content == "-----":
656
+ # Add image to chat
 
657
  stored_messages.append(
658
  gr.ChatMessage(
659
  role="assistant",
 
663
  },
664
  )
665
  )
666
+ # Yield updated chat AND the latest screenshot for the side panel
667
+ yield stored_messages, agent.last_marked_screenshot.to_string()
668
+ else:
669
+ stored_messages.append(msg)
670
+ yield stored_messages, None # Keep previous side image
671
 
672
  if consent_storage:
673
  summary = get_agent_summary_erase_images(agent)
674
  save_final_status(data_dir, "completed", summary=summary)
675
+
676
+ yield stored_messages, None
677
 
678
  except Exception as e:
679
  error_message = f"Error in interaction: {str(e)}"
 
681
  stored_messages.append(
682
  gr.ChatMessage(role="assistant", content="Run failed:\n" + error_message)
683
  )
684
+ yield stored_messages, None
685
+ finally:
686
+ # Important: Cleanup Sandbox
687
+ sandbox.cleanup()
 
 
 
688
 
689
+ # Gradio Block Construction
690
theme = gr.themes.Default(
    font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue"
)

with gr.Blocks(theme=theme, css=custom_css) as demo:
    # Per-session state: a random session id, the running agent, and the chat log.
    session_uuid_state = gr.State(lambda: str(uuid.uuid4()))
    session_state = gr.State({})
    stored_messages = gr.State([])

    with gr.Row():
        # Left Sidebar: Inputs
        with gr.Column(scale=1):
            gr.Markdown("### Fara CUA - Chrome Agent 🌐")

            task_input = gr.Textbox(
                value="Go to google.com and search for 'Hugging Face'",
                label="Task",
                lines=3,
            )
            run_btn = gr.Button("Start Task", variant="primary")
            stop_btn = gr.Button("Stop", variant="secondary")

            consent_storage = gr.Checkbox(label="Save logs locally?", value=True)

            gr.Examples(
                examples=[
                    "Go to google.com and search for 'Hugging Face', then click the first link.",
                    "Go to wikipedia.org, type 'Python' in search, and click the search button.",
                    "Open huggingface.co and find the 'Spaces' link.",
                ],
                inputs=task_input,
            )

        # Right Main: Chat & Live View
        with gr.Column(scale=3):
            with gr.Row():
                # Side-by-side: Chat and Latest Screenshot
                with gr.Column(scale=1):
                    chatbot_display = gr.Chatbot(
                        label="Agent Trace",
                        type="messages",
                        height=800,
                        avatar_images=(None, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png"),
                    )
                with gr.Column(scale=1):
                    gr.Markdown("### Latest Browser View")
                    live_browser_view = gr.Image(
                        label="Browser View",
                        type="filepath",
                        interactive=False,
                        height=600,
                    )

    # UI Handler
    # A placeholder agent is passed only to construct the UI wrapper.
    agent_ui = EnrichedGradioUI(CodeAgent(tools=[], model=Model(), name="init"))

    def interrupt_agent(session_state):
        """Ask the agent stored in this session (if any) to stop.

        Returns nothing: the listener below declares no outputs, so a return
        value would only be discarded.
        """
        agent = session_state.get("agent")
        if agent is not None and hasattr(agent, "interrupt_switch"):
            agent.interrupt_switch = True

    # Run Logic
    run_event = run_btn.click(
        fn=agent_ui.interact_with_agent,
        inputs=[
            task_input,
            stored_messages,
            session_state,
            session_uuid_state,
            consent_storage,
        ],
        outputs=[chatbot_display, live_browser_view],
    )

    stop_btn.click(fn=interrupt_agent, inputs=[session_state], outputs=[])

if __name__ == "__main__":
    demo.launch()