Spaces: Running on Zero

Update app.py

app.py (CHANGED)
@@ -11,7 +11,7 @@ from datetime import datetime
 
 import gradio as gr
 import torch
-import spaces
+import spaces
 from dotenv import load_dotenv
 from e2b_desktop import Sandbox
 from gradio_modal import Modal
@@ -58,7 +58,12 @@ if not os.path.exists(TMP_DIR):
 
 print("Loading Fara Model... This may take a moment.")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-MODEL_ID_F = "microsoft/Fara-7B"
+# Using the Microsoft Fara model as requested
+MODEL_ID_F = "microsoft/Fara-7B"
+
+# Global model variables
+model_f = None
+processor_f = None
 
 try:
     processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
@@ -73,7 +78,6 @@ except Exception as e:
     print(f"Error loading Fara Model: {e}")
     print("Falling back to Qwen/Qwen2.5-VL-7B-Instruct for demonstration if Fara is unavailable...")
     try:
-        # Fallback to base Qwen-VL if Fara repo isn't public/accessible
        MODEL_ID_F = "Qwen/Qwen2.5-VL-7B-Instruct"
        processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
        model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -85,17 +89,67 @@ except Exception as e:
         print(f"Fallback Model ({MODEL_ID_F}) loaded successfully.")
     except Exception as inner_e:
         print(f"Critical error loading model: {inner_e}")
-
-
+
+
+# -----------------------------------------------------------------------------
+# GPU ISOLATED INFERENCE FUNCTION
+# -----------------------------------------------------------------------------
+
+@spaces.GPU(duration=120)
+def run_model_inference(formatted_messages, max_tokens=1024, stop_sequences=None):
+    """
+    This function runs on the GPU worker.
+    It receives simple python objects (lists/dicts), not the complex Agent object.
+    """
+    global model_f, processor_f
+
+    if model_f is None:
+        raise ValueError("Model is not loaded.")
+
+    # Process Inputs (Tokenization happens here to ensure tensors are on correct device)
+    text = processor_f.apply_chat_template(
+        formatted_messages, tokenize=False, add_generation_prompt=True
+    )
+
+    image_inputs, video_inputs = process_vision_info(formatted_messages)
+
+    inputs = processor_f(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+
+    # Move inputs to the model's device (GPU)
+    inputs = inputs.to(model_f.device)
+
+    # Generate
+    with torch.no_grad():
+        generated_ids = model_f.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            stop_strings=stop_sequences,
+            tokenizer=processor_f.tokenizer,
+        )
+
+    # Decode
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor_f.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+
+    return output_text
+
 
 class FaraLocalModel(Model):
     """
     Wrapper for the local Fara (Qwen2.5-VL) model to work with SmolAgents.
     """
-    def __init__(self, model, processor, **kwargs):
+    def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.model = model
-        self.processor = processor
 
     def __call__(
         self,
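For context on the change above: on ZeroGPU Spaces, a function decorated with @spaces.GPU runs in a separate GPU worker process, so its arguments and return value must pickle cleanly. That is why inference moves out of the agent class into a module-level function that receives only plain lists, dicts, and PIL images. A minimal sketch of the same pattern (the model ID and prompt handling here are illustrative, not taken from app.py):

import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative small model, not the app's
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)

@spaces.GPU(duration=120)  # the GPU is attached only while this call runs
def generate(prompt: str, max_new_tokens: int = 256) -> str:
    # Arguments and the return value cross a process boundary on ZeroGPU,
    # so keep them to plain picklable types (str, int, lists, dicts, PIL images).
    model.to("cuda")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(
        output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

Model loading stays at import time on the CPU; only the decorated call sees the GPU, which mirrors how run_model_inference reads the module-level globals model_f and processor_f.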
@@ -103,12 +157,11 @@ class FaraLocalModel(Model):
         stop_sequences: Optional[List[str]] = None,
         **kwargs,
     ) -> ChatMessage:
-        if self.model is None:
-            raise ValueError("Fara Model is not loaded.")
-
+
         formatted_messages = []
 
         # Convert SmolAgents messages to Qwen/Transformers format
+        # We perform this conversion here (CPU side) to create simple dicts/lists
         for msg in messages:
             role = msg["role"]
             content = msg["content"]
@@ -124,7 +177,7 @@ class FaraLocalModel(Model):
                 elif isinstance(item, dict):
                     if "type" in item:
                         if item["type"] == "image":
-                            # Handle path or url
+                            # Handle path or url - extract value to ensure serializability
                             val = item.get("image") or item.get("url") or item.get("path")
                             new_content.append({"type": "image", "image": val})
                         else:
@@ -132,39 +185,14 @@ class FaraLocalModel(Model):
 
             formatted_messages.append({"role": role, "content": new_content})
 
-        # Apply chat template
-        text = self.processor.apply_chat_template(
-            formatted_messages, tokenize=False, add_generation_prompt=True
+        # Call the decorated global function
+        # This crosses the boundary to the GPU worker safely because
+        # formatted_messages contains only standard Python types (str, list, dict, PIL.Image)
+        output_text = run_model_inference(
+            formatted_messages=formatted_messages,
+            max_tokens=kwargs.get("max_tokens", 1024),
+            stop_sequences=stop_sequences
         )
-
-        image_inputs, video_inputs = process_vision_info(formatted_messages)
-
-        inputs = self.processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-
-        inputs = inputs.to(self.model.device)
-
-        # Generate
-        with torch.no_grad():
-            generated_ids = self.model.generate(
-                **inputs,
-                max_new_tokens=kwargs.get("max_tokens", 1024),
-                stop_strings=stop_sequences,
-                tokenizer=self.processor.tokenizer,
-            )
-
-        # Decode
-        generated_ids_trimmed = [
-            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        output_text = self.processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )[0]
 
         return ChatMessage(
             role=MessageRole.ASSISTANT,
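To make the wrapper's contract concrete, a rough usage sketch follows (the message content, image path, and stop sequence are hypothetical; the shapes follow the conversion loop above, and the return value is a smolagents ChatMessage):

model = FaraLocalModel()

reply = model(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe the current screen."},
                {"type": "image", "image": "/tmp/screenshot.png"},  # hypothetical path
            ],
        }
    ],
    stop_sequences=["<end_action>"],  # hypothetical stop marker
)
print(reply.role, reply.content)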
@@ -615,11 +643,8 @@ def save_final_status(folder, status: str, summary, error_message=None) -> None:
         print(f"Failed to save metadata: {e}")
 
 def create_agent(data_dir, desktop):
-    # Ensure the model loaded before building the agent
-    if model_f is None:
-        raise RuntimeError("Fara model was not loaded successfully.")
-
-    model = FaraLocalModel(model=model_f, processor=processor_f)
+    # Initialize the wrapper that calls the global GPU function
+    model = FaraLocalModel()
 
     return E2BVisionAgent(
         model=model,
@@ -755,7 +780,7 @@ def initialize_session(interactive_mode, browser_uuid):
     return update_html(interactive_mode, browser_uuid), browser_uuid
 
 class EnrichedGradioUI(GradioUI):
-    @spaces.GPU
+    # REMOVED @spaces.GPU from here to prevent pickling the E2B Sandbox (which has locks)
     def interact_with_agent(
         self,
         task_input,
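The removed decorator is the crux of this commit: @spaces.GPU pickles whatever the decorated callable can reach, and a bound method drags self along, including the E2B Sandbox and its internal locks. The failure mode is easy to reproduce in isolation with a stand-in object (not the real Sandbox):

import pickle
import threading

class FakeSandbox:
    # Stand-in for e2b_desktop.Sandbox; real network clients hold locks and sockets.
    def __init__(self):
        self._lock = threading.Lock()

try:
    pickle.dumps(FakeSandbox())
except TypeError as exc:
    print(exc)  # cannot pickle '_thread.lock' object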
@@ -772,31 +797,34 @@ class EnrichedGradioUI(GradioUI):
         if not os.path.exists(data_dir):
             os.makedirs(data_dir)
 
-        # Create the agent for this session
-        session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop)
+        # Create fresh agent.
+        # Note: We do NOT store the full agent in session_state passed between Gradio events
+        # if possible, or if we do, we ensure this function isn't wrapped in @spaces.GPU
+        agent = create_agent(data_dir=data_dir, desktop=desktop)
+        session_state["agent"] = agent  # Storing in state is fine if this function runs on CPU
 
         try:
             stored_messages.append(gr.ChatMessage(role="user", content=task_input))
             yield stored_messages
 
-            screenshot_bytes = session_state["agent"].desktop.screenshot(format="bytes")
+            screenshot_bytes = agent.desktop.screenshot(format="bytes")
             initial_screenshot = Image.open(BytesIO(screenshot_bytes))
 
             for msg in stream_to_gradio(
-                session_state["agent"],
+                agent,
                 task=task_input,
                 task_images=[initial_screenshot],
                 reset_agent_memory=False,
             ):
                 if (
-                    hasattr(session_state["agent"], "last_marked_screenshot")
+                    hasattr(agent, "last_marked_screenshot")
                    and msg.content == "-----"
                ):
                    stored_messages.append(
                        gr.ChatMessage(
                            role="assistant",
                            content={
-                                "path": session_state["agent"].last_marked_screenshot.to_string(),
+                                "path": agent.last_marked_screenshot.to_string(),
                                "mime_type": "image/png",
                            },
                        )
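One detail in this hunk worth calling out: the sandbox screenshot arrives as raw PNG bytes, which PIL opens from an in-memory buffer rather than a file path. A self-contained equivalent that fabricates the bytes locally instead of calling E2B:

from io import BytesIO
from PIL import Image

# Stand-in for agent.desktop.screenshot(format="bytes"): produce PNG bytes in memory.
buffer = BytesIO()
Image.new("RGB", (64, 64), "white").save(buffer, format="PNG")
png_bytes = buffer.getvalue()

initial_screenshot = Image.open(BytesIO(png_bytes))
print(initial_screenshot.size)  # (64, 64)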
@@ -805,7 +833,7 @@ class EnrichedGradioUI(GradioUI):
                 yield stored_messages
 
             if consent_storage:
-                summary = get_agent_summary_erase_images(session_state["agent"])
+                summary = get_agent_summary_erase_images(agent)
                 save_final_status(data_dir, "completed", summary=summary)
             yield stored_messages
 
@@ -891,7 +919,7 @@ This agent uses **microsoft/Fara-7B** (running locally via ZeroGPU) and **smolagents**
     return update_html(True, session_uuid)
 
 def interrupt_agent(session_state):
-    if "agent" in session_state and not session_state["agent"].interrupt_switch:
+    if "agent" in session_state and hasattr(session_state["agent"], "interrupt_switch") and not session_state["agent"].interrupt_switch:
         session_state["agent"].interrupt()
         return "Stopped"
     return "Stop"