Spaces:

prithivMLmods
/

CUA-GUI-Operator

Running on Zero

App Files Community

prithivMLmods committed 11 days ago

Commit

79a20f7

verified ·

1 Parent(s): c715222

update app

Browse files

Files changed (1) hide show

app.py +26 -20

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import os
 import shutil
 import time
 import uuid
-import spaces #[zeroGPU Spaces]
 import unicodedata
 from io import BytesIO
 from threading import Timer
@@ -12,6 +11,7 @@ from datetime import datetime
 import gradio as gr
 import torch
 from dotenv import load_dotenv
 from e2b_desktop import Sandbox
 from gradio_modal import Modal
@@ -39,7 +39,7 @@ load_dotenv(override=True)
 # -----------------------------------------------------------------------------
 E2B_API_KEY = os.getenv("E2B_API")
-HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API")
 if HF_TOKEN:
     login(token=HF_TOKEN)
@@ -58,7 +58,7 @@ if not os.path.exists(TMP_DIR):
 print("Loading Fara Model... This may take a moment.")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-MODEL_ID_F = "microsoft/Fara-7B"  # Ensure this ID is accessible or point to local path
 try:
     processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
@@ -66,19 +66,27 @@ try:
         MODEL_ID_F,
         trust_remote_code=True,
         torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
-        device_map="auto" if DEVICE == "cuda" else None,
     )
-    if DEVICE == "cpu":
-        model_f.to(DEVICE)
-    model_f.eval()
     print(f"Fara Model loaded successfully on {DEVICE}")
 except Exception as e:
     print(f"Error loading Fara Model: {e}")
-    print("Please ensure you have access to the model and enough GPU memory.")
-    # Fallback to prevent crash during import, though app won't work without model
-    model_f = None
-    processor_f = None
 class FaraLocalModel(Model):
     """
@@ -98,12 +106,9 @@ class FaraLocalModel(Model):
         if self.model is None:
             raise ValueError("Fara Model is not loaded.")
-        # Convert SmolAgents messages to Qwen/Transformers format
-        # SmolAgents uses a specific dict structure for content.
-        # We need to normalize it for process_vision_info / apply_chat_template
         formatted_messages = []
         for msg in messages:
             role = msg["role"]
             content = msg["content"]
@@ -150,7 +155,7 @@ class FaraLocalModel(Model):
                 **inputs,
                 max_new_tokens=kwargs.get("max_tokens", 1024),
                 stop_strings=stop_sequences,
-                tokenizer=self.processor.tokenizer, # Specific for stop_strings in modern transformers
             )
         # Decode
@@ -185,7 +190,7 @@ Action:
 click(254, 308)
 ```<end_code>
-Akways format your action ('Action:' part) as Python code blocks as shown above.
 </action_process>
 <tools>
@@ -220,7 +225,7 @@ In browser, ignore any sign-in popups while they don't interfere with the elemen
 </general_guidelines>
 """.replace("<<current_date>>", datetime.now().strftime("%A, %d-%B-%Y"))
-@spaces.GPU
 def draw_marker_on_image(image_copy, click_coordinates):
     x, y = click_coordinates
     draw = ImageDraw.Draw(image_copy)
@@ -750,6 +755,7 @@ def initialize_session(interactive_mode, browser_uuid):
         return update_html(interactive_mode, browser_uuid), browser_uuid
 class EnrichedGradioUI(GradioUI):
     def interact_with_agent(
         self,
         task_input,
@@ -836,7 +842,7 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
         with gr.Sidebar(position="left"):
             with Modal(visible=True) as modal:
                 gr.Markdown("""### Welcome to Fara CUA Demo 🖥️
-This agent uses **microsoft/Fara-7B** (running locally) and **smolagents** to control a remote computer.
 👉 Type a task, click 'Let's go!', and watch the agent work.
 """)

 import shutil
 import time
 import uuid
 import unicodedata
 from io import BytesIO
 from threading import Timer
 import gradio as gr
 import torch
+import spaces  # <--- Added Spaces support
 from dotenv import load_dotenv
 from e2b_desktop import Sandbox
 from gradio_modal import Modal
 # -----------------------------------------------------------------------------
 E2B_API_KEY = os.getenv("E2B_API")
+HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
 if HF_TOKEN:
     login(token=HF_TOKEN)
 print("Loading Fara Model... This may take a moment.")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+MODEL_ID_F = "microsoft/Fara-7B"  # Ensure this repository exists and you have access
 try:
     processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
         MODEL_ID_F,
         trust_remote_code=True,
         torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
+        device_map="auto",
     )
     print(f"Fara Model loaded successfully on {DEVICE}")
 except Exception as e:
     print(f"Error loading Fara Model: {e}")
+    print("Falling back to Qwen/Qwen2.5-VL-7B-Instruct for demonstration if Fara is unavailable...")
+    try:
+        # Fallback to base Qwen-VL if Fara repo isn't public/accessible
+        MODEL_ID_F = "Qwen/Qwen2.5-VL-7B-Instruct"
+        processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
+        model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            MODEL_ID_F,
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
+            device_map="auto",
+        )
+        print(f"Fallback Model ({MODEL_ID_F}) loaded successfully.")
+    except Exception as inner_e:
+        print(f"Critical error loading model: {inner_e}")
+        model_f = None
+        processor_f = None
 class FaraLocalModel(Model):
     """
         if self.model is None:
             raise ValueError("Fara Model is not loaded.")
         formatted_messages = []
+        # Convert SmolAgents messages to Qwen/Transformers format
         for msg in messages:
             role = msg["role"]
             content = msg["content"]
                 **inputs,
                 max_new_tokens=kwargs.get("max_tokens", 1024),
                 stop_strings=stop_sequences,
+                tokenizer=self.processor.tokenizer,
             )
         # Decode
 click(254, 308)
 ```<end_code>
+Always format your action ('Action:' part) as Python code blocks as shown above.
 </action_process>
 <tools>
 </general_guidelines>
 """.replace("<<current_date>>", datetime.now().strftime("%A, %d-%B-%Y"))
 def draw_marker_on_image(image_copy, click_coordinates):
     x, y = click_coordinates
     draw = ImageDraw.Draw(image_copy)
         return update_html(interactive_mode, browser_uuid), browser_uuid
 class EnrichedGradioUI(GradioUI):
+    @spaces.GPU(duration=180) # Allocate GPU for 3 minutes per interaction cycle
     def interact_with_agent(
         self,
         task_input,
         with gr.Sidebar(position="left"):
             with Modal(visible=True) as modal:
                 gr.Markdown("""### Welcome to Fara CUA Demo 🖥️
+This agent uses **microsoft/Fara-7B** (running locally via ZeroGPU) and **smolagents** to control a remote computer.
 👉 Type a task, click 'Let's go!', and watch the agent work.
 """)