Spaces:
Running
on
Zero
Running
on
Zero
update app
Browse files
app.py
CHANGED
|
@@ -3,7 +3,6 @@ import os
|
|
| 3 |
import shutil
|
| 4 |
import time
|
| 5 |
import uuid
|
| 6 |
-
import spaces #[zeroGPU Spaces]
|
| 7 |
import unicodedata
|
| 8 |
from io import BytesIO
|
| 9 |
from threading import Timer
|
|
@@ -12,6 +11,7 @@ from datetime import datetime
|
|
| 12 |
|
| 13 |
import gradio as gr
|
| 14 |
import torch
|
|
|
|
| 15 |
from dotenv import load_dotenv
|
| 16 |
from e2b_desktop import Sandbox
|
| 17 |
from gradio_modal import Modal
|
|
@@ -39,7 +39,7 @@ load_dotenv(override=True)
|
|
| 39 |
# -----------------------------------------------------------------------------
|
| 40 |
|
| 41 |
E2B_API_KEY = os.getenv("E2B_API")
|
| 42 |
-
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("
|
| 43 |
if HF_TOKEN:
|
| 44 |
login(token=HF_TOKEN)
|
| 45 |
|
|
@@ -58,7 +58,7 @@ if not os.path.exists(TMP_DIR):
|
|
| 58 |
|
| 59 |
print("Loading Fara Model... This may take a moment.")
|
| 60 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 61 |
-
MODEL_ID_F = "microsoft/Fara-7B" # Ensure this
|
| 62 |
|
| 63 |
try:
|
| 64 |
processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
|
|
@@ -66,19 +66,27 @@ try:
|
|
| 66 |
MODEL_ID_F,
|
| 67 |
trust_remote_code=True,
|
| 68 |
torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
|
| 69 |
-
device_map="auto"
|
| 70 |
)
|
| 71 |
-
if DEVICE == "cpu":
|
| 72 |
-
model_f.to(DEVICE)
|
| 73 |
-
|
| 74 |
-
model_f.eval()
|
| 75 |
print(f"Fara Model loaded successfully on {DEVICE}")
|
| 76 |
except Exception as e:
|
| 77 |
print(f"Error loading Fara Model: {e}")
|
| 78 |
-
print("
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
class FaraLocalModel(Model):
|
| 84 |
"""
|
|
@@ -98,12 +106,9 @@ class FaraLocalModel(Model):
|
|
| 98 |
if self.model is None:
|
| 99 |
raise ValueError("Fara Model is not loaded.")
|
| 100 |
|
| 101 |
-
# Convert SmolAgents messages to Qwen/Transformers format
|
| 102 |
-
# SmolAgents uses a specific dict structure for content.
|
| 103 |
-
# We need to normalize it for process_vision_info / apply_chat_template
|
| 104 |
-
|
| 105 |
formatted_messages = []
|
| 106 |
|
|
|
|
| 107 |
for msg in messages:
|
| 108 |
role = msg["role"]
|
| 109 |
content = msg["content"]
|
|
@@ -150,7 +155,7 @@ class FaraLocalModel(Model):
|
|
| 150 |
**inputs,
|
| 151 |
max_new_tokens=kwargs.get("max_tokens", 1024),
|
| 152 |
stop_strings=stop_sequences,
|
| 153 |
-
tokenizer=self.processor.tokenizer,
|
| 154 |
)
|
| 155 |
|
| 156 |
# Decode
|
|
@@ -185,7 +190,7 @@ Action:
|
|
| 185 |
click(254, 308)
|
| 186 |
```<end_code>
|
| 187 |
|
| 188 |
-
|
| 189 |
</action_process>
|
| 190 |
|
| 191 |
<tools>
|
|
@@ -220,7 +225,7 @@ In browser, ignore any sign-in popups while they don't interfere with the elemen
|
|
| 220 |
</general_guidelines>
|
| 221 |
""".replace("<<current_date>>", datetime.now().strftime("%A, %d-%B-%Y"))
|
| 222 |
|
| 223 |
-
|
| 224 |
def draw_marker_on_image(image_copy, click_coordinates):
|
| 225 |
x, y = click_coordinates
|
| 226 |
draw = ImageDraw.Draw(image_copy)
|
|
@@ -750,6 +755,7 @@ def initialize_session(interactive_mode, browser_uuid):
|
|
| 750 |
return update_html(interactive_mode, browser_uuid), browser_uuid
|
| 751 |
|
| 752 |
class EnrichedGradioUI(GradioUI):
|
|
|
|
| 753 |
def interact_with_agent(
|
| 754 |
self,
|
| 755 |
task_input,
|
|
@@ -836,7 +842,7 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
|
|
| 836 |
with gr.Sidebar(position="left"):
|
| 837 |
with Modal(visible=True) as modal:
|
| 838 |
gr.Markdown("""### Welcome to Fara CUA Demo 🖥️
|
| 839 |
-
This agent uses **microsoft/Fara-7B** (running locally) and **smolagents** to control a remote computer.
|
| 840 |
|
| 841 |
👉 Type a task, click 'Let's go!', and watch the agent work.
|
| 842 |
""")
|
|
|
|
| 3 |
import shutil
|
| 4 |
import time
|
| 5 |
import uuid
|
|
|
|
| 6 |
import unicodedata
|
| 7 |
from io import BytesIO
|
| 8 |
from threading import Timer
|
|
|
|
| 11 |
|
| 12 |
import gradio as gr
|
| 13 |
import torch
|
| 14 |
+
import spaces # <--- Added Spaces support
|
| 15 |
from dotenv import load_dotenv
|
| 16 |
from e2b_desktop import Sandbox
|
| 17 |
from gradio_modal import Modal
|
|
|
|
| 39 |
# -----------------------------------------------------------------------------
|
| 40 |
|
| 41 |
E2B_API_KEY = os.getenv("E2B_API")
|
| 42 |
+
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
|
| 43 |
if HF_TOKEN:
|
| 44 |
login(token=HF_TOKEN)
|
| 45 |
|
|
|
|
| 58 |
|
| 59 |
print("Loading Fara Model... This may take a moment.")
|
| 60 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 61 |
+
MODEL_ID_F = "microsoft/Fara-7B" # Ensure this repository exists and you have access
|
| 62 |
|
| 63 |
try:
|
| 64 |
processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
|
|
|
|
| 66 |
MODEL_ID_F,
|
| 67 |
trust_remote_code=True,
|
| 68 |
torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
|
| 69 |
+
device_map="auto",
|
| 70 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
print(f"Fara Model loaded successfully on {DEVICE}")
|
| 72 |
except Exception as e:
|
| 73 |
print(f"Error loading Fara Model: {e}")
|
| 74 |
+
print("Falling back to Qwen/Qwen2.5-VL-7B-Instruct for demonstration if Fara is unavailable...")
|
| 75 |
+
try:
|
| 76 |
+
# Fallback to base Qwen-VL if Fara repo isn't public/accessible
|
| 77 |
+
MODEL_ID_F = "Qwen/Qwen2.5-VL-7B-Instruct"
|
| 78 |
+
processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
|
| 79 |
+
model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 80 |
+
MODEL_ID_F,
|
| 81 |
+
trust_remote_code=True,
|
| 82 |
+
torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
|
| 83 |
+
device_map="auto",
|
| 84 |
+
)
|
| 85 |
+
print(f"Fallback Model ({MODEL_ID_F}) loaded successfully.")
|
| 86 |
+
except Exception as inner_e:
|
| 87 |
+
print(f"Critical error loading model: {inner_e}")
|
| 88 |
+
model_f = None
|
| 89 |
+
processor_f = None
|
| 90 |
|
| 91 |
class FaraLocalModel(Model):
|
| 92 |
"""
|
|
|
|
| 106 |
if self.model is None:
|
| 107 |
raise ValueError("Fara Model is not loaded.")
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
formatted_messages = []
|
| 110 |
|
| 111 |
+
# Convert SmolAgents messages to Qwen/Transformers format
|
| 112 |
for msg in messages:
|
| 113 |
role = msg["role"]
|
| 114 |
content = msg["content"]
|
|
|
|
| 155 |
**inputs,
|
| 156 |
max_new_tokens=kwargs.get("max_tokens", 1024),
|
| 157 |
stop_strings=stop_sequences,
|
| 158 |
+
tokenizer=self.processor.tokenizer,
|
| 159 |
)
|
| 160 |
|
| 161 |
# Decode
|
|
|
|
| 190 |
click(254, 308)
|
| 191 |
```<end_code>
|
| 192 |
|
| 193 |
+
Always format your action ('Action:' part) as Python code blocks as shown above.
|
| 194 |
</action_process>
|
| 195 |
|
| 196 |
<tools>
|
|
|
|
| 225 |
</general_guidelines>
|
| 226 |
""".replace("<<current_date>>", datetime.now().strftime("%A, %d-%B-%Y"))
|
| 227 |
|
| 228 |
+
|
| 229 |
def draw_marker_on_image(image_copy, click_coordinates):
|
| 230 |
x, y = click_coordinates
|
| 231 |
draw = ImageDraw.Draw(image_copy)
|
|
|
|
| 755 |
return update_html(interactive_mode, browser_uuid), browser_uuid
|
| 756 |
|
| 757 |
class EnrichedGradioUI(GradioUI):
|
| 758 |
+
@spaces.GPU(duration=180) # Allocate GPU for 3 minutes per interaction cycle
|
| 759 |
def interact_with_agent(
|
| 760 |
self,
|
| 761 |
task_input,
|
|
|
|
| 842 |
with gr.Sidebar(position="left"):
|
| 843 |
with Modal(visible=True) as modal:
|
| 844 |
gr.Markdown("""### Welcome to Fara CUA Demo 🖥️
|
| 845 |
+
This agent uses **microsoft/Fara-7B** (running locally via ZeroGPU) and **smolagents** to control a remote computer.
|
| 846 |
|
| 847 |
👉 Type a task, click 'Let's go!', and watch the agent work.
|
| 848 |
""")
|