Spaces:
Running
Image agent fixes: sizing, error handling, result nudge, registry filtering
Browse files- Add max-height: 400px to all image CSS rules and inline styles
- Filter globalFigureRegistry to only include images/figures referenced
in sub-agent <result> content (prevents command center from rendering
images the sub-agent didn't explicitly include)
- Extract shared nudge_for_result() utility in agents.py, replacing
duplicated nudge code in agent.py, code.py, and image.py
- Return actual error messages from execute_generate_image/execute_edit_image
(tuple return) so the LLM can adapt its strategy on failure
- Show real error messages in frontend tool cells instead of generic
"Failed to process image"
- Resize large input images to 1024px max before sending to HF
image_to_image API (FLUX.1-Kontext-dev expects ~1024px inputs)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- backend/agent.py +26 -36
- backend/agents.py +54 -4
- backend/code.py +25 -32
- backend/image.py +27 -15
- backend/tools.py +57 -14
- frontend/script.js +63 -9
- frontend/style.css +47 -0
|
@@ -13,7 +13,9 @@ from typing import List, Dict, Optional
|
|
| 13 |
from .tools import (
|
| 14 |
web_search, read_url,
|
| 15 |
execute_web_search, execute_read_url,
|
|
|
|
| 16 |
)
|
|
|
|
| 17 |
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
|
|
@@ -95,7 +97,8 @@ def stream_agent_execution(
|
|
| 95 |
messages: List[Dict],
|
| 96 |
serper_key: str,
|
| 97 |
extra_params: Optional[Dict] = None,
|
| 98 |
-
abort_event=None
|
|
|
|
| 99 |
):
|
| 100 |
"""
|
| 101 |
Run the agent tool-calling loop.
|
|
@@ -229,18 +232,30 @@ def stream_agent_execution(
|
|
| 229 |
# Execute tool
|
| 230 |
result = execute_tool(func_name, args, serper_key)
|
| 231 |
|
| 232 |
-
# Build tool response
|
| 233 |
-
if result.get("image"):
|
| 234 |
-
#
|
|
|
|
| 235 |
tool_response_content = [
|
| 236 |
{"type": "text", "text": result["content"]},
|
| 237 |
-
{"type": "image_url", "image_url": {"url": f"data:image/
|
| 238 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
else:
|
| 240 |
tool_response_content = result["content"]
|
| 241 |
|
| 242 |
-
tool_response_str = tool_response_content if isinstance(tool_response_content, str) else json.dumps(tool_response_content)
|
| 243 |
-
|
| 244 |
# Add to message history
|
| 245 |
messages.append({
|
| 246 |
"role": "assistant",
|
|
@@ -250,7 +265,7 @@ def stream_agent_execution(
|
|
| 250 |
messages.append({
|
| 251 |
"role": "tool",
|
| 252 |
"tool_call_id": tool_call.id,
|
| 253 |
-
"content":
|
| 254 |
})
|
| 255 |
|
| 256 |
# Signal tool result to frontend (include response for history)
|
|
@@ -259,7 +274,7 @@ def stream_agent_execution(
|
|
| 259 |
"tool": func_name,
|
| 260 |
"tool_call_id": tool_call.id,
|
| 261 |
"result": result.get("display", {}),
|
| 262 |
-
"response":
|
| 263 |
}
|
| 264 |
if result.get("image"):
|
| 265 |
tool_result_event["image"] = result["image"]
|
|
@@ -281,32 +296,7 @@ def stream_agent_execution(
|
|
| 281 |
|
| 282 |
# If agent finished without a <result>, nudge it for one
|
| 283 |
if not has_result:
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
"content": "Please provide your final answer now. Wrap it in <result> tags."
|
| 287 |
-
})
|
| 288 |
-
try:
|
| 289 |
-
call_params = {
|
| 290 |
-
"messages": messages,
|
| 291 |
-
"model": model,
|
| 292 |
-
}
|
| 293 |
-
if extra_params:
|
| 294 |
-
call_params["extra_body"] = extra_params
|
| 295 |
-
response = client.chat.completions.create(**call_params)
|
| 296 |
-
nudge_content = response.choices[0].message.content or ""
|
| 297 |
-
result_match = re.search(r'<result>(.*?)</result>', nudge_content, re.DOTALL | re.IGNORECASE)
|
| 298 |
-
if result_match:
|
| 299 |
-
result_content = result_match.group(1).strip()
|
| 300 |
-
thinking = re.sub(r'<result>.*?</result>', '', nudge_content, flags=re.DOTALL | re.IGNORECASE).strip()
|
| 301 |
-
if thinking:
|
| 302 |
-
yield {"type": "content", "content": thinking}
|
| 303 |
-
yield {"type": "result_preview", "content": result_content}
|
| 304 |
-
yield {"type": "result", "content": result_content}
|
| 305 |
-
elif nudge_content.strip():
|
| 306 |
-
# No result tags but got content — use it as the result
|
| 307 |
-
yield {"type": "result_preview", "content": nudge_content.strip()}
|
| 308 |
-
yield {"type": "result", "content": nudge_content.strip()}
|
| 309 |
-
except Exception as e:
|
| 310 |
-
logger.warning(f"Result nudge failed: {e}")
|
| 311 |
|
| 312 |
yield {"type": "done"}
|
|
|
|
| 13 |
from .tools import (
|
| 14 |
web_search, read_url,
|
| 15 |
execute_web_search, execute_read_url,
|
| 16 |
+
extract_and_download_images,
|
| 17 |
)
|
| 18 |
+
from .image import resize_image_for_vlm
|
| 19 |
|
| 20 |
logger = logging.getLogger(__name__)
|
| 21 |
|
|
|
|
| 97 |
messages: List[Dict],
|
| 98 |
serper_key: str,
|
| 99 |
extra_params: Optional[Dict] = None,
|
| 100 |
+
abort_event=None,
|
| 101 |
+
multimodal: bool = False
|
| 102 |
):
|
| 103 |
"""
|
| 104 |
Run the agent tool-calling loop.
|
|
|
|
| 232 |
# Execute tool
|
| 233 |
result = execute_tool(func_name, args, serper_key)
|
| 234 |
|
| 235 |
+
# Build tool response content for LLM
|
| 236 |
+
if result.get("image") and multimodal:
|
| 237 |
+
# Send screenshot as multimodal content so VLM can see it
|
| 238 |
+
vlm_image = resize_image_for_vlm(result["image"])
|
| 239 |
tool_response_content = [
|
| 240 |
{"type": "text", "text": result["content"]},
|
| 241 |
+
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{vlm_image}"}}
|
| 242 |
]
|
| 243 |
+
elif func_name == "read_url" and multimodal:
|
| 244 |
+
# Extract and include page images so VLM can see them
|
| 245 |
+
page_images = extract_and_download_images(result["content"])
|
| 246 |
+
if page_images:
|
| 247 |
+
tool_response_content = [{"type": "text", "text": result["content"]}]
|
| 248 |
+
for img_b64 in page_images:
|
| 249 |
+
vlm_img = resize_image_for_vlm(img_b64)
|
| 250 |
+
tool_response_content.append({
|
| 251 |
+
"type": "image_url",
|
| 252 |
+
"image_url": {"url": f"data:image/jpeg;base64,{vlm_img}"}
|
| 253 |
+
})
|
| 254 |
+
else:
|
| 255 |
+
tool_response_content = result["content"]
|
| 256 |
else:
|
| 257 |
tool_response_content = result["content"]
|
| 258 |
|
|
|
|
|
|
|
| 259 |
# Add to message history
|
| 260 |
messages.append({
|
| 261 |
"role": "assistant",
|
|
|
|
| 265 |
messages.append({
|
| 266 |
"role": "tool",
|
| 267 |
"tool_call_id": tool_call.id,
|
| 268 |
+
"content": tool_response_content
|
| 269 |
})
|
| 270 |
|
| 271 |
# Signal tool result to frontend (include response for history)
|
|
|
|
| 274 |
"tool": func_name,
|
| 275 |
"tool_call_id": tool_call.id,
|
| 276 |
"result": result.get("display", {}),
|
| 277 |
+
"response": result.get("content", ""),
|
| 278 |
}
|
| 279 |
if result.get("image"):
|
| 280 |
tool_result_event["image"] = result["image"]
|
|
|
|
| 296 |
|
| 297 |
# If agent finished without a <result>, nudge it for one
|
| 298 |
if not has_result:
|
| 299 |
+
from .agents import nudge_for_result
|
| 300 |
+
yield from nudge_for_result(client, model, messages, extra_params=extra_params)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
yield {"type": "done"}
|
|
@@ -82,9 +82,11 @@ AGENT_REGISTRY = {
|
|
| 82 |
"(this preserves context and the Jupyter kernel for code agents).\n\n"
|
| 83 |
"## Presenting Results\n\n"
|
| 84 |
"Sub-agent results may be collapsed in the UI. When presenting results to the user, "
|
| 85 |
-
"always include the key findings in YOUR response text — don't just say \"see the agent result\".
|
| 86 |
-
"
|
| 87 |
-
"
|
|
|
|
|
|
|
| 88 |
"## Handling Aborted Agents\n\n"
|
| 89 |
"If an agent's result is 'Generation aborted by user.', the user deliberately stopped it. "
|
| 90 |
"Do NOT automatically re-launch the same task. Instead, briefly acknowledge the abort and "
|
|
@@ -313,7 +315,8 @@ AGENT_REGISTRY = {
|
|
| 313 |
"Returns an image reference (e.g., 'image_1') that you can see.\n"
|
| 314 |
"- **edit_image(prompt, source)**: Edit or transform an existing image. "
|
| 315 |
"The source can be a URL, a local file path, or an image reference from a previous tool call (e.g., 'image_1').\n"
|
| 316 |
-
"- **read_image(source)**: Load
|
|
|
|
| 317 |
"Returns an image reference that you can see and use with edit_image.\n\n"
|
| 318 |
"## Strategy\n\n"
|
| 319 |
"1. If the user provides an image URL or file path, use read_image first to load it\n"
|
|
@@ -385,6 +388,53 @@ def get_system_prompt(agent_key: str) -> str:
|
|
| 385 |
return prompt
|
| 386 |
|
| 387 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 388 |
def get_tools() -> list:
|
| 389 |
"""Get tool definitions for the command center."""
|
| 390 |
return [
|
|
|
|
| 82 |
"(this preserves context and the Jupyter kernel for code agents).\n\n"
|
| 83 |
"## Presenting Results\n\n"
|
| 84 |
"Sub-agent results may be collapsed in the UI. When presenting results to the user, "
|
| 85 |
+
"always include the key findings in YOUR response text — don't just say \"see the agent result\".\n\n"
|
| 86 |
+
"**Embedding images/figures from sub-agents:** If a sub-agent result contains image or figure "
|
| 87 |
+
"references like <image_1> or <figure_1>, you can embed them directly in your response using "
|
| 88 |
+
"the same tags (e.g., <image_1>, <figure_2>). The UI will resolve these to the actual images. "
|
| 89 |
+
"Always embed the relevant images when discussing visual results.\n\n"
|
| 90 |
"## Handling Aborted Agents\n\n"
|
| 91 |
"If an agent's result is 'Generation aborted by user.', the user deliberately stopped it. "
|
| 92 |
"Do NOT automatically re-launch the same task. Instead, briefly acknowledge the abort and "
|
|
|
|
| 315 |
"Returns an image reference (e.g., 'image_1') that you can see.\n"
|
| 316 |
"- **edit_image(prompt, source)**: Edit or transform an existing image. "
|
| 317 |
"The source can be a URL, a local file path, or an image reference from a previous tool call (e.g., 'image_1').\n"
|
| 318 |
+
"- **read_image(source)**: Load a raster image (PNG, JPEG, GIF, WebP, BMP) from a URL or local file path. "
|
| 319 |
+
"SVG is NOT supported — if given an SVG URL, tell the user and ask for a raster format instead. "
|
| 320 |
"Returns an image reference that you can see and use with edit_image.\n\n"
|
| 321 |
"## Strategy\n\n"
|
| 322 |
"1. If the user provides an image URL or file path, use read_image first to load it\n"
|
|
|
|
| 388 |
return prompt
|
| 389 |
|
| 390 |
|
| 391 |
+
def nudge_for_result(client, model, messages, extra_params=None, extra_result_data=None):
|
| 392 |
+
"""Nudge an agent that finished without <result> tags to produce one.
|
| 393 |
+
|
| 394 |
+
This is a generator that yields SSE events (content, result_preview, result).
|
| 395 |
+
Call it after an agent's tool loop when no <result> was found.
|
| 396 |
+
|
| 397 |
+
Args:
|
| 398 |
+
client: OpenAI-compatible client
|
| 399 |
+
model: Model name
|
| 400 |
+
messages: Full message history (will be mutated — nudge message appended)
|
| 401 |
+
extra_params: Optional extra_body params for the LLM call
|
| 402 |
+
extra_result_data: Optional dict of extra fields to include in result events
|
| 403 |
+
(e.g. {"figures": {...}} or {"images": {...}})
|
| 404 |
+
"""
|
| 405 |
+
import re
|
| 406 |
+
import logging
|
| 407 |
+
_logger = logging.getLogger(__name__)
|
| 408 |
+
|
| 409 |
+
messages.append({
|
| 410 |
+
"role": "user",
|
| 411 |
+
"content": "Please provide your final answer now. Wrap it in <result> tags."
|
| 412 |
+
})
|
| 413 |
+
try:
|
| 414 |
+
call_params = {"messages": messages, "model": model}
|
| 415 |
+
if extra_params:
|
| 416 |
+
call_params["extra_body"] = extra_params
|
| 417 |
+
response = client.chat.completions.create(**call_params)
|
| 418 |
+
nudge_content = response.choices[0].message.content or ""
|
| 419 |
+
result_match = re.search(r'<result>(.*?)</result>', nudge_content, re.DOTALL | re.IGNORECASE)
|
| 420 |
+
|
| 421 |
+
extra = extra_result_data or {}
|
| 422 |
+
|
| 423 |
+
if result_match:
|
| 424 |
+
result_content = result_match.group(1).strip()
|
| 425 |
+
thinking = re.sub(r'<result>.*?</result>', '', nudge_content, flags=re.DOTALL | re.IGNORECASE).strip()
|
| 426 |
+
if thinking:
|
| 427 |
+
yield {"type": "content", "content": thinking}
|
| 428 |
+
yield {"type": "result_preview", "content": result_content, **extra}
|
| 429 |
+
yield {"type": "result", "content": result_content, **extra}
|
| 430 |
+
elif nudge_content.strip():
|
| 431 |
+
# No result tags but got content — use it as the result
|
| 432 |
+
yield {"type": "result_preview", "content": nudge_content.strip(), **extra}
|
| 433 |
+
yield {"type": "result", "content": nudge_content.strip(), **extra}
|
| 434 |
+
except Exception as e:
|
| 435 |
+
_logger.warning(f"Result nudge failed: {e}")
|
| 436 |
+
|
| 437 |
+
|
| 438 |
def get_tools() -> list:
|
| 439 |
"""Get tool definitions for the command center."""
|
| 440 |
return [
|
|
@@ -9,6 +9,7 @@ from typing import List, Dict, Optional
|
|
| 9 |
from e2b_code_interpreter import Sandbox
|
| 10 |
|
| 11 |
from .tools import execute_code, upload_files, download_files
|
|
|
|
| 12 |
|
| 13 |
logger = logging.getLogger(__name__)
|
| 14 |
|
|
@@ -204,7 +205,7 @@ def download_files_from_sandbox(sbx: Sandbox, files: List[Dict], files_root: str
|
|
| 204 |
return "\n".join(results)
|
| 205 |
|
| 206 |
|
| 207 |
-
def stream_code_execution(client, model: str, messages: List[Dict], sbx: Sandbox, files_root: str = None, extra_params: Optional[Dict] = None, abort_event=None):
|
| 208 |
"""
|
| 209 |
Stream code execution results
|
| 210 |
|
|
@@ -429,11 +430,27 @@ def stream_code_execution(client, model: str, messages: List[Dict], sbx: Sandbox
|
|
| 429 |
}]
|
| 430 |
})
|
| 431 |
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
"
|
| 435 |
-
|
| 436 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
|
| 438 |
elif tool_call.function.name == "upload_files":
|
| 439 |
# Parse arguments
|
|
@@ -571,32 +588,8 @@ def stream_code_execution(client, model: str, messages: List[Dict], sbx: Sandbox
|
|
| 571 |
|
| 572 |
# If agent finished without a <result>, nudge it for one
|
| 573 |
if not has_result:
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
"content": "Please provide your final answer now. Wrap it in <result> tags."
|
| 577 |
-
})
|
| 578 |
-
try:
|
| 579 |
-
call_params = {
|
| 580 |
-
"messages": messages,
|
| 581 |
-
"model": model,
|
| 582 |
-
}
|
| 583 |
-
if extra_params:
|
| 584 |
-
call_params["extra_body"] = extra_params
|
| 585 |
-
response = client.chat.completions.create(**call_params)
|
| 586 |
-
nudge_content = response.choices[0].message.content or ""
|
| 587 |
-
result_match = re.search(r'<result>(.*?)</result>', nudge_content, re.DOTALL | re.IGNORECASE)
|
| 588 |
-
if result_match:
|
| 589 |
-
result_content = result_match.group(1).strip()
|
| 590 |
-
thinking = re.sub(r'<result>.*?</result>', '', nudge_content, flags=re.DOTALL | re.IGNORECASE).strip()
|
| 591 |
-
if thinking:
|
| 592 |
-
yield {"type": "content", "content": thinking}
|
| 593 |
-
yield {"type": "result_preview", "content": result_content, "figures": figure_data}
|
| 594 |
-
yield {"type": "result", "content": result_content, "figures": figure_data}
|
| 595 |
-
elif nudge_content.strip():
|
| 596 |
-
yield {"type": "result_preview", "content": nudge_content.strip(), "figures": figure_data}
|
| 597 |
-
yield {"type": "result", "content": nudge_content.strip(), "figures": figure_data}
|
| 598 |
-
except Exception as e:
|
| 599 |
-
logger.warning(f"Result nudge failed: {e}")
|
| 600 |
|
| 601 |
# Send done signal
|
| 602 |
yield {"type": "done"}
|
|
|
|
| 9 |
from e2b_code_interpreter import Sandbox
|
| 10 |
|
| 11 |
from .tools import execute_code, upload_files, download_files
|
| 12 |
+
from .image import resize_image_for_vlm
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
|
|
|
| 205 |
return "\n".join(results)
|
| 206 |
|
| 207 |
|
| 208 |
+
def stream_code_execution(client, model: str, messages: List[Dict], sbx: Sandbox, files_root: str = None, extra_params: Optional[Dict] = None, abort_event=None, multimodal: bool = False):
|
| 209 |
"""
|
| 210 |
Stream code execution results
|
| 211 |
|
|
|
|
| 430 |
}]
|
| 431 |
})
|
| 432 |
|
| 433 |
+
# Build tool response — include figures if multimodal
|
| 434 |
+
if multimodal and images:
|
| 435 |
+
tool_content = [{"type": "text", "text": output}]
|
| 436 |
+
for img in images:
|
| 437 |
+
if img["type"] in ("png", "jpeg"):
|
| 438 |
+
vlm_img = resize_image_for_vlm(img["data"])
|
| 439 |
+
tool_content.append({
|
| 440 |
+
"type": "image_url",
|
| 441 |
+
"image_url": {"url": f"data:image/jpeg;base64,{vlm_img}"}
|
| 442 |
+
})
|
| 443 |
+
messages.append({
|
| 444 |
+
"role": "tool",
|
| 445 |
+
"tool_call_id": tool_call.id,
|
| 446 |
+
"content": tool_content
|
| 447 |
+
})
|
| 448 |
+
else:
|
| 449 |
+
messages.append({
|
| 450 |
+
"role": "tool",
|
| 451 |
+
"tool_call_id": tool_call.id,
|
| 452 |
+
"content": output
|
| 453 |
+
})
|
| 454 |
|
| 455 |
elif tool_call.function.name == "upload_files":
|
| 456 |
# Parse arguments
|
|
|
|
| 588 |
|
| 589 |
# If agent finished without a <result>, nudge it for one
|
| 590 |
if not has_result:
|
| 591 |
+
from .agents import nudge_for_result
|
| 592 |
+
yield from nudge_for_result(client, model, messages, extra_params=extra_params, extra_result_data={"figures": figure_data})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
|
| 594 |
# Send done signal
|
| 595 |
yield {"type": "done"}
|
|
@@ -97,7 +97,7 @@ def execute_tool(tool_name: str, args: dict, hf_token: str, image_store: dict, i
|
|
| 97 |
if tool_name == "generate_image":
|
| 98 |
prompt = args.get("prompt", "")
|
| 99 |
model = args.get("model") or default_gen_model or "black-forest-labs/FLUX.1-schnell"
|
| 100 |
-
base64_png = execute_generate_image(prompt, hf_token, model)
|
| 101 |
|
| 102 |
if base64_png:
|
| 103 |
image_counter += 1
|
|
@@ -112,7 +112,7 @@ def execute_tool(tool_name: str, args: dict, hf_token: str, image_store: dict, i
|
|
| 112 |
}
|
| 113 |
else:
|
| 114 |
return {
|
| 115 |
-
"content": f"Failed to generate image
|
| 116 |
"display": {"type": "generate_error", "prompt": prompt},
|
| 117 |
"image_counter": image_counter,
|
| 118 |
}
|
|
@@ -138,7 +138,7 @@ def execute_tool(tool_name: str, args: dict, hf_token: str, image_store: dict, i
|
|
| 138 |
"image_counter": image_counter,
|
| 139 |
}
|
| 140 |
|
| 141 |
-
base64_png = execute_edit_image(prompt, source_bytes, hf_token, model)
|
| 142 |
|
| 143 |
if base64_png:
|
| 144 |
image_counter += 1
|
|
@@ -153,7 +153,7 @@ def execute_tool(tool_name: str, args: dict, hf_token: str, image_store: dict, i
|
|
| 153 |
}
|
| 154 |
else:
|
| 155 |
return {
|
| 156 |
-
"content": f"Failed to edit image
|
| 157 |
"display": {"type": "edit_error", "source": source},
|
| 158 |
"image_counter": image_counter,
|
| 159 |
}
|
|
@@ -174,8 +174,14 @@ def execute_tool(tool_name: str, args: dict, hf_token: str, image_store: dict, i
|
|
| 174 |
"image_counter": image_counter,
|
| 175 |
}
|
| 176 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
return {
|
| 178 |
-
"content":
|
| 179 |
"display": {"type": "read_image_error", "url": source},
|
| 180 |
"image_counter": image_counter,
|
| 181 |
}
|
|
@@ -196,7 +202,8 @@ def stream_image_execution(
|
|
| 196 |
image_edit_model: Optional[str] = None,
|
| 197 |
extra_params: Optional[Dict] = None,
|
| 198 |
abort_event=None,
|
| 199 |
-
files_root: str = None
|
|
|
|
| 200 |
):
|
| 201 |
"""
|
| 202 |
Run the image agent tool-calling loop.
|
|
@@ -334,9 +341,8 @@ def stream_image_execution(
|
|
| 334 |
result = execute_tool(func_name, args, hf_token, image_store, image_counter, default_gen_model=image_gen_model, default_edit_model=image_edit_model, files_root=files_root)
|
| 335 |
image_counter = result.get("image_counter", image_counter)
|
| 336 |
|
| 337 |
-
# Build tool response
|
| 338 |
-
if result.get("image"):
|
| 339 |
-
# Resize image for VLM context to avoid token overflow
|
| 340 |
vlm_image = resize_image_for_vlm(result["image"])
|
| 341 |
tool_response_content = [
|
| 342 |
{"type": "text", "text": result["content"]},
|
|
@@ -385,12 +391,18 @@ def stream_image_execution(
|
|
| 385 |
if not done:
|
| 386 |
yield {"type": "generating"}
|
| 387 |
|
| 388 |
-
#
|
| 389 |
if not result_sent and image_store:
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
|
| 396 |
yield {"type": "done"}
|
|
|
|
| 97 |
if tool_name == "generate_image":
|
| 98 |
prompt = args.get("prompt", "")
|
| 99 |
model = args.get("model") or default_gen_model or "black-forest-labs/FLUX.1-schnell"
|
| 100 |
+
base64_png, error = execute_generate_image(prompt, hf_token, model)
|
| 101 |
|
| 102 |
if base64_png:
|
| 103 |
image_counter += 1
|
|
|
|
| 112 |
}
|
| 113 |
else:
|
| 114 |
return {
|
| 115 |
+
"content": f"Failed to generate image: {error}",
|
| 116 |
"display": {"type": "generate_error", "prompt": prompt},
|
| 117 |
"image_counter": image_counter,
|
| 118 |
}
|
|
|
|
| 138 |
"image_counter": image_counter,
|
| 139 |
}
|
| 140 |
|
| 141 |
+
base64_png, error = execute_edit_image(prompt, source_bytes, hf_token, model)
|
| 142 |
|
| 143 |
if base64_png:
|
| 144 |
image_counter += 1
|
|
|
|
| 153 |
}
|
| 154 |
else:
|
| 155 |
return {
|
| 156 |
+
"content": f"Failed to edit image: {error}",
|
| 157 |
"display": {"type": "edit_error", "source": source},
|
| 158 |
"image_counter": image_counter,
|
| 159 |
}
|
|
|
|
| 174 |
"image_counter": image_counter,
|
| 175 |
}
|
| 176 |
else:
|
| 177 |
+
# Provide more specific error for SVG files
|
| 178 |
+
is_svg = source.lower().endswith(".svg") or "/svg" in source.lower()
|
| 179 |
+
if is_svg:
|
| 180 |
+
error_msg = f"Failed to load image from '{source}'. SVG format is not supported — only raster formats (PNG, JPEG, GIF, WebP, BMP) are accepted. Ask the user for a raster version of the image."
|
| 181 |
+
else:
|
| 182 |
+
error_msg = f"Failed to load image from '{source}'. Check that the path or URL is correct and that it is a raster image (PNG, JPEG, GIF, WebP, BMP)."
|
| 183 |
return {
|
| 184 |
+
"content": error_msg,
|
| 185 |
"display": {"type": "read_image_error", "url": source},
|
| 186 |
"image_counter": image_counter,
|
| 187 |
}
|
|
|
|
| 202 |
image_edit_model: Optional[str] = None,
|
| 203 |
extra_params: Optional[Dict] = None,
|
| 204 |
abort_event=None,
|
| 205 |
+
files_root: str = None,
|
| 206 |
+
multimodal: bool = False
|
| 207 |
):
|
| 208 |
"""
|
| 209 |
Run the image agent tool-calling loop.
|
|
|
|
| 341 |
result = execute_tool(func_name, args, hf_token, image_store, image_counter, default_gen_model=image_gen_model, default_edit_model=image_edit_model, files_root=files_root)
|
| 342 |
image_counter = result.get("image_counter", image_counter)
|
| 343 |
|
| 344 |
+
# Build tool response content for LLM
|
| 345 |
+
if result.get("image") and multimodal:
|
|
|
|
| 346 |
vlm_image = resize_image_for_vlm(result["image"])
|
| 347 |
tool_response_content = [
|
| 348 |
{"type": "text", "text": result["content"]},
|
|
|
|
| 391 |
if not done:
|
| 392 |
yield {"type": "generating"}
|
| 393 |
|
| 394 |
+
# If agent finished without a <result>, nudge it for one
|
| 395 |
if not result_sent and image_store:
|
| 396 |
+
from .agents import nudge_for_result
|
| 397 |
+
nudge_produced_result = False
|
| 398 |
+
for event in nudge_for_result(client, model, messages, extra_params=extra_params, extra_result_data={"images": image_store}):
|
| 399 |
+
yield event
|
| 400 |
+
if event.get("type") == "result":
|
| 401 |
+
nudge_produced_result = True
|
| 402 |
+
|
| 403 |
+
# Final fallback: synthesize a result with all images
|
| 404 |
+
if not nudge_produced_result:
|
| 405 |
+
fallback_parts = [f"<{name}>" for name in image_store]
|
| 406 |
+
yield {"type": "result", "content": "\n\n".join(fallback_parts), "images": image_store}
|
| 407 |
|
| 408 |
yield {"type": "done"}
|
|
@@ -419,7 +419,7 @@ read_image = {
|
|
| 419 |
"type": "function",
|
| 420 |
"function": {
|
| 421 |
"name": "read_image",
|
| 422 |
-
"description": "Load
|
| 423 |
"parameters": {
|
| 424 |
"type": "object",
|
| 425 |
"properties": {
|
|
@@ -441,48 +441,56 @@ read_image_url = read_image
|
|
| 441 |
# Image tool execution functions
|
| 442 |
# ============================================================
|
| 443 |
|
| 444 |
-
def execute_generate_image(prompt: str, hf_token: str, model: str = "black-forest-labs/FLUX.1-schnell") ->
|
| 445 |
-
"""Text-to-image via HF InferenceClient. Returns
|
| 446 |
try:
|
| 447 |
from huggingface_hub import InferenceClient
|
| 448 |
except ImportError:
|
| 449 |
-
|
| 450 |
-
return None
|
| 451 |
|
| 452 |
try:
|
| 453 |
client = InferenceClient(token=hf_token)
|
| 454 |
image = client.text_to_image(prompt, model=model)
|
| 455 |
buffer = io.BytesIO()
|
| 456 |
image.save(buffer, format="PNG")
|
| 457 |
-
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
| 458 |
except Exception as e:
|
| 459 |
logger.error(f"Generate image error: {e}")
|
| 460 |
-
return None
|
| 461 |
|
| 462 |
|
| 463 |
-
def execute_edit_image(prompt: str, source_image_bytes: bytes, hf_token: str, model: str = "black-forest-labs/FLUX.1-Kontext-dev") ->
|
| 464 |
-
"""Image-to-image via HF InferenceClient.
|
| 465 |
try:
|
| 466 |
from huggingface_hub import InferenceClient
|
| 467 |
from PIL import Image
|
| 468 |
except ImportError:
|
| 469 |
-
|
| 470 |
-
return None
|
| 471 |
|
| 472 |
try:
|
| 473 |
client = InferenceClient(token=hf_token)
|
| 474 |
input_image = Image.open(io.BytesIO(source_image_bytes))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
result = client.image_to_image(input_image, prompt=prompt, model=model)
|
| 476 |
buffer = io.BytesIO()
|
| 477 |
result.save(buffer, format="PNG")
|
| 478 |
-
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
| 479 |
except Exception as e:
|
| 480 |
logger.error(f"Edit image error: {e}")
|
| 481 |
-
return None
|
| 482 |
|
| 483 |
|
| 484 |
def execute_read_image(source: str, files_root: str = None) -> Optional[str]:
|
| 485 |
-
"""Load image from URL or local file path, return base64 string or None on error.
|
|
|
|
|
|
|
|
|
|
| 486 |
import os
|
| 487 |
|
| 488 |
# Check if it's a URL
|
|
@@ -523,6 +531,41 @@ def execute_read_image(source: str, files_root: str = None) -> Optional[str]:
|
|
| 523 |
return None
|
| 524 |
|
| 525 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 526 |
# Keep old name as alias
|
| 527 |
def execute_read_image_url(url: str) -> Optional[str]:
|
| 528 |
return execute_read_image(url)
|
|
|
|
| 419 |
"type": "function",
|
| 420 |
"function": {
|
| 421 |
"name": "read_image",
|
| 422 |
+
"description": "Load a raster image (PNG, JPEG, GIF, WebP, BMP) from a URL or local file path. SVG is NOT supported. Returns an image reference name (e.g., 'image_1') that you can see and use with edit_image.",
|
| 423 |
"parameters": {
|
| 424 |
"type": "object",
|
| 425 |
"properties": {
|
|
|
|
| 441 |
# Image tool execution functions
|
| 442 |
# ============================================================
|
| 443 |
|
| 444 |
+
def execute_generate_image(prompt: str, hf_token: str, model: str = "black-forest-labs/FLUX.1-schnell") -> tuple:
|
| 445 |
+
"""Text-to-image via HF InferenceClient. Returns (base64_png, None) on success or (None, error_str) on failure."""
|
| 446 |
try:
|
| 447 |
from huggingface_hub import InferenceClient
|
| 448 |
except ImportError:
|
| 449 |
+
return None, "huggingface_hub not installed"
|
|
|
|
| 450 |
|
| 451 |
try:
|
| 452 |
client = InferenceClient(token=hf_token)
|
| 453 |
image = client.text_to_image(prompt, model=model)
|
| 454 |
buffer = io.BytesIO()
|
| 455 |
image.save(buffer, format="PNG")
|
| 456 |
+
return base64.b64encode(buffer.getvalue()).decode("utf-8"), None
|
| 457 |
except Exception as e:
|
| 458 |
logger.error(f"Generate image error: {e}")
|
| 459 |
+
return None, str(e)
|
| 460 |
|
| 461 |
|
| 462 |
+
def execute_edit_image(prompt: str, source_image_bytes: bytes, hf_token: str, model: str = "black-forest-labs/FLUX.1-Kontext-dev") -> tuple:
|
| 463 |
+
"""Image-to-image via HF InferenceClient. Returns (base64_png, None) on success or (None, error_str) on failure."""
|
| 464 |
try:
|
| 465 |
from huggingface_hub import InferenceClient
|
| 466 |
from PIL import Image
|
| 467 |
except ImportError:
|
| 468 |
+
return None, "huggingface_hub or Pillow not installed"
|
|
|
|
| 469 |
|
| 470 |
try:
|
| 471 |
client = InferenceClient(token=hf_token)
|
| 472 |
input_image = Image.open(io.BytesIO(source_image_bytes))
|
| 473 |
+
|
| 474 |
+
# Resize large images to avoid API failures (most models expect ~1024px)
|
| 475 |
+
MAX_EDIT_DIM = 1024
|
| 476 |
+
if max(input_image.size) > MAX_EDIT_DIM:
|
| 477 |
+
input_image.thumbnail((MAX_EDIT_DIM, MAX_EDIT_DIM), Image.LANCZOS)
|
| 478 |
+
logger.info(f"Resized input image to {input_image.size} for editing")
|
| 479 |
+
|
| 480 |
result = client.image_to_image(input_image, prompt=prompt, model=model)
|
| 481 |
buffer = io.BytesIO()
|
| 482 |
result.save(buffer, format="PNG")
|
| 483 |
+
return base64.b64encode(buffer.getvalue()).decode("utf-8"), None
|
| 484 |
except Exception as e:
|
| 485 |
logger.error(f"Edit image error: {e}")
|
| 486 |
+
return None, str(e)
|
| 487 |
|
| 488 |
|
| 489 |
def execute_read_image(source: str, files_root: str = None) -> Optional[str]:
|
| 490 |
+
"""Load image from URL or local file path, return base64 string or None on error.
|
| 491 |
+
|
| 492 |
+
Supported formats: PNG, JPEG, GIF, WebP, BMP. SVG is NOT supported.
|
| 493 |
+
"""
|
| 494 |
import os
|
| 495 |
|
| 496 |
# Check if it's a URL
|
|
|
|
| 531 |
return None
|
| 532 |
|
| 533 |
|
| 534 |
+
def extract_and_download_images(markdown: str, max_images: int = 5) -> List[str]:
|
| 535 |
+
"""Extract image URLs from markdown and download them as base64 strings.
|
| 536 |
+
|
| 537 |
+
Returns list of base64-encoded image strings (PNG/JPEG).
|
| 538 |
+
Skips SVGs, data URIs, and failed downloads.
|
| 539 |
+
"""
|
| 540 |
+
import re as _re
|
| 541 |
+
img_pattern = _re.compile(r'!\[[^\]]*\]\(([^)]+)\)')
|
| 542 |
+
urls = img_pattern.findall(markdown)
|
| 543 |
+
|
| 544 |
+
results = []
|
| 545 |
+
for url in urls:
|
| 546 |
+
if len(results) >= max_images:
|
| 547 |
+
break
|
| 548 |
+
if url.startswith("data:") or url.endswith(".svg"):
|
| 549 |
+
continue
|
| 550 |
+
try:
|
| 551 |
+
resp = httpx.get(
|
| 552 |
+
url,
|
| 553 |
+
follow_redirects=True,
|
| 554 |
+
timeout=10,
|
| 555 |
+
headers={"User-Agent": _USER_AGENT}
|
| 556 |
+
)
|
| 557 |
+
if resp.status_code != 200:
|
| 558 |
+
continue
|
| 559 |
+
ct = resp.headers.get("content-type", "")
|
| 560 |
+
if not ct.startswith("image/"):
|
| 561 |
+
continue
|
| 562 |
+
results.append(base64.b64encode(resp.content).decode("utf-8"))
|
| 563 |
+
except Exception:
|
| 564 |
+
continue
|
| 565 |
+
|
| 566 |
+
return results
|
| 567 |
+
|
| 568 |
+
|
| 569 |
# Keep old name as alias
|
| 570 |
def execute_read_image_url(url: str) -> Optional[str]:
|
| 571 |
return execute_read_image(url)
|
|
@@ -93,6 +93,10 @@ const actionWidgets = {};
|
|
| 93 |
// Track tool call IDs for result updates (maps tabId -> tool_call_id)
|
| 94 |
const toolCallIds = {};
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
// Track agents by task_id for reuse (maps task_id -> tabId)
|
| 97 |
const taskIdToTabId = {};
|
| 98 |
|
|
@@ -128,6 +132,7 @@ function resetLocalState() {
|
|
| 128 |
// Clear object maps
|
| 129 |
Object.keys(actionWidgets).forEach(k => delete actionWidgets[k]);
|
| 130 |
Object.keys(toolCallIds).forEach(k => delete toolCallIds[k]);
|
|
|
|
| 131 |
Object.keys(taskIdToTabId).forEach(k => delete taskIdToTabId[k]);
|
| 132 |
researchQueryTabIds = {};
|
| 133 |
showAllTurns = true;
|
|
@@ -2021,6 +2026,7 @@ async function streamChatResponse(messages, chatContainer, agentType, tabId) {
|
|
| 2021 |
token: modelConfig.token || null,
|
| 2022 |
model: modelConfig.model,
|
| 2023 |
extra_params: modelConfig.extraParams || null,
|
|
|
|
| 2024 |
e2b_key: currentSettings.e2bKey || null,
|
| 2025 |
serper_key: currentSettings.serperKey || null,
|
| 2026 |
hf_token: currentSettings.hfToken || null,
|
|
@@ -2084,7 +2090,7 @@ async function streamChatResponse(messages, chatContainer, agentType, tabId) {
|
|
| 2084 |
currentMessageEl = createAssistantMessage(chatContainer);
|
| 2085 |
}
|
| 2086 |
fullResponse += data.content;
|
| 2087 |
-
appendToMessage(currentMessageEl, parseMarkdown(fullResponse));
|
| 2088 |
scrollChatToBottom(chatContainer);
|
| 2089 |
|
| 2090 |
} else if (data.type === 'code') {
|
|
@@ -2128,6 +2134,22 @@ async function streamChatResponse(messages, chatContainer, agentType, tabId) {
|
|
| 2128 |
// Still generating - no action needed
|
| 2129 |
|
| 2130 |
} else if (data.type === 'result') {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2131 |
// Agent result - update command center widget
|
| 2132 |
updateActionWidgetWithResult(tabId, data.content, data.figures, data.images);
|
| 2133 |
|
|
@@ -2175,7 +2197,7 @@ async function streamChatResponse(messages, chatContainer, agentType, tabId) {
|
|
| 2175 |
for (const [placeholderId, figureData] of Object.entries(figurePlaceholders)) {
|
| 2176 |
let imageHtml = '';
|
| 2177 |
if (figureData.type === 'png' || figureData.type === 'jpeg') {
|
| 2178 |
-
imageHtml = `<img src="data:image/${figureData.type};base64,${figureData.data}" style="max-width: 400px; height: auto; border-radius: 4px; margin: 12px 0; display: block;" onclick="openImageModal(this.src)">`;
|
| 2179 |
} else if (figureData.type === 'svg') {
|
| 2180 |
imageHtml = `<div style="margin: 12px 0;">${atob(figureData.data)}</div>`;
|
| 2181 |
}
|
|
@@ -2371,7 +2393,8 @@ async function streamChatResponse(messages, chatContainer, agentType, tabId) {
|
|
| 2371 |
const imgName = data.image_name || 'image';
|
| 2372 |
outputHtml = `<img src="data:image/png;base64,${data.image}" alt="${escapeHtml(imgName)}" class="generated-img" />`;
|
| 2373 |
} else if ((data.tool === 'generate_image' || data.tool === 'edit_image' || data.tool === 'read_image_url' || data.tool === 'read_image') && !data.image) {
|
| 2374 |
-
|
|
|
|
| 2375 |
}
|
| 2376 |
|
| 2377 |
if (outputHtml && lastToolCell) {
|
|
@@ -2389,7 +2412,7 @@ async function streamChatResponse(messages, chatContainer, agentType, tabId) {
|
|
| 2389 |
currentMessageEl = createAssistantMessage(chatContainer);
|
| 2390 |
}
|
| 2391 |
fullResponse += data.content;
|
| 2392 |
-
appendToMessage(currentMessageEl, parseMarkdown(fullResponse));
|
| 2393 |
scrollChatToBottom(chatContainer);
|
| 2394 |
|
| 2395 |
} else if (data.type === 'launch') {
|
|
@@ -2521,6 +2544,16 @@ async function streamChatResponse(messages, chatContainer, agentType, tabId) {
|
|
| 2521 |
errorDiv.innerHTML = `<div class="message-content" style="color: #c62828;">Error: ${escapeHtml(data.content)}</div>`;
|
| 2522 |
chatContainer.appendChild(errorDiv);
|
| 2523 |
scrollChatToBottom(chatContainer);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2524 |
}
|
| 2525 |
}
|
| 2526 |
}
|
|
@@ -2865,7 +2898,7 @@ async function updateActionWidgetWithResult(tabId, resultContent, figures, image
|
|
| 2865 |
for (const [placeholderId, figureData] of Object.entries(figurePlaceholders)) {
|
| 2866 |
let imageHtml = '';
|
| 2867 |
if (figureData.type === 'png' || figureData.type === 'jpeg') {
|
| 2868 |
-
imageHtml = `<img src="data:image/${figureData.type};base64,${figureData.data}" style="max-width: 400px; height: auto; border-radius: 4px; margin: 12px 0; display: block;" onclick="openImageModal(this.src)">`;
|
| 2869 |
} else if (figureData.type === 'svg') {
|
| 2870 |
imageHtml = `<div style="margin: 12px 0;">${atob(figureData.data)}</div>`;
|
| 2871 |
}
|
|
@@ -3115,6 +3148,22 @@ if (typeof marked !== 'undefined') {
|
|
| 3115 |
});
|
| 3116 |
}
|
| 3117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3118 |
function parseMarkdown(text) {
|
| 3119 |
// Use marked library for proper markdown parsing
|
| 3120 |
let html;
|
|
@@ -4128,7 +4177,7 @@ function deleteProvider(providerId) {
|
|
| 4128 |
// Show add/edit model dialog
|
| 4129 |
function showModelDialog(modelId = null) {
|
| 4130 |
const isEdit = !!modelId;
|
| 4131 |
-
const model = isEdit ? settings.models[modelId] : { name: '', providerId: '', modelId: '', extraParams: null };
|
| 4132 |
|
| 4133 |
const dialog = document.getElementById('model-dialog');
|
| 4134 |
const title = document.getElementById('model-dialog-title');
|
|
@@ -4136,11 +4185,13 @@ function showModelDialog(modelId = null) {
|
|
| 4136 |
const providerSelect = document.getElementById('model-provider');
|
| 4137 |
const modelIdInput = document.getElementById('model-model-id');
|
| 4138 |
const extraParamsInput = document.getElementById('model-extra-params');
|
|
|
|
| 4139 |
|
| 4140 |
title.textContent = isEdit ? 'Edit Model' : 'Add Model';
|
| 4141 |
nameInput.value = model.name;
|
| 4142 |
modelIdInput.value = model.modelId;
|
| 4143 |
extraParamsInput.value = model.extraParams ? JSON.stringify(model.extraParams, null, 2) : '';
|
|
|
|
| 4144 |
|
| 4145 |
// Populate provider dropdown
|
| 4146 |
providerSelect.innerHTML = '<option value="">-- Select Provider --</option>';
|
|
@@ -4187,7 +4238,8 @@ function saveModelFromDialog() {
|
|
| 4187 |
}
|
| 4188 |
}
|
| 4189 |
|
| 4190 |
-
|
|
|
|
| 4191 |
hideModelDialog();
|
| 4192 |
renderModelsList();
|
| 4193 |
populateModelDropdowns();
|
|
@@ -4492,7 +4544,8 @@ function resolveModelConfig(agentType) {
|
|
| 4492 |
endpoint: provider.endpoint,
|
| 4493 |
token: provider.token,
|
| 4494 |
model: model.modelId,
|
| 4495 |
-
extraParams: model.extraParams || null
|
|
|
|
| 4496 |
};
|
| 4497 |
}
|
| 4498 |
|
|
@@ -4510,7 +4563,8 @@ function getDefaultModelConfig() {
|
|
| 4510 |
endpoint: provider.endpoint,
|
| 4511 |
token: provider.token,
|
| 4512 |
model: model.modelId,
|
| 4513 |
-
extraParams: model.extraParams || null
|
|
|
|
| 4514 |
};
|
| 4515 |
}
|
| 4516 |
|
|
|
|
| 93 |
// Track tool call IDs for result updates (maps tabId -> tool_call_id)
|
| 94 |
const toolCallIds = {};
|
| 95 |
|
| 96 |
+
// Global figure/image registry populated by sub-agents for cross-agent reference resolution
|
| 97 |
+
// Maps "figure_1" -> {type, data} and "image_1" -> {type: "png", data: base64}
|
| 98 |
+
const globalFigureRegistry = {};
|
| 99 |
+
|
| 100 |
// Track agents by task_id for reuse (maps task_id -> tabId)
|
| 101 |
const taskIdToTabId = {};
|
| 102 |
|
|
|
|
| 132 |
// Clear object maps
|
| 133 |
Object.keys(actionWidgets).forEach(k => delete actionWidgets[k]);
|
| 134 |
Object.keys(toolCallIds).forEach(k => delete toolCallIds[k]);
|
| 135 |
+
Object.keys(globalFigureRegistry).forEach(k => delete globalFigureRegistry[k]);
|
| 136 |
Object.keys(taskIdToTabId).forEach(k => delete taskIdToTabId[k]);
|
| 137 |
researchQueryTabIds = {};
|
| 138 |
showAllTurns = true;
|
|
|
|
| 2026 |
token: modelConfig.token || null,
|
| 2027 |
model: modelConfig.model,
|
| 2028 |
extra_params: modelConfig.extraParams || null,
|
| 2029 |
+
multimodal: modelConfig.multimodal || false,
|
| 2030 |
e2b_key: currentSettings.e2bKey || null,
|
| 2031 |
serper_key: currentSettings.serperKey || null,
|
| 2032 |
hf_token: currentSettings.hfToken || null,
|
|
|
|
| 2090 |
currentMessageEl = createAssistantMessage(chatContainer);
|
| 2091 |
}
|
| 2092 |
fullResponse += data.content;
|
| 2093 |
+
appendToMessage(currentMessageEl, resolveGlobalFigureRefs(parseMarkdown(fullResponse)));
|
| 2094 |
scrollChatToBottom(chatContainer);
|
| 2095 |
|
| 2096 |
} else if (data.type === 'code') {
|
|
|
|
| 2134 |
// Still generating - no action needed
|
| 2135 |
|
| 2136 |
} else if (data.type === 'result') {
|
| 2137 |
+
// Populate global figure/image registry only for items referenced in result content
|
| 2138 |
+
const resultText = data.content || '';
|
| 2139 |
+
if (data.figures) {
|
| 2140 |
+
for (const [name, figData] of Object.entries(data.figures)) {
|
| 2141 |
+
if (new RegExp(`</?${name}>`, 'i').test(resultText)) {
|
| 2142 |
+
globalFigureRegistry[name] = figData;
|
| 2143 |
+
}
|
| 2144 |
+
}
|
| 2145 |
+
}
|
| 2146 |
+
if (data.images) {
|
| 2147 |
+
for (const [name, imgBase64] of Object.entries(data.images)) {
|
| 2148 |
+
if (new RegExp(`</?${name}>`, 'i').test(resultText)) {
|
| 2149 |
+
globalFigureRegistry[name] = { type: 'png', data: imgBase64 };
|
| 2150 |
+
}
|
| 2151 |
+
}
|
| 2152 |
+
}
|
| 2153 |
// Agent result - update command center widget
|
| 2154 |
updateActionWidgetWithResult(tabId, data.content, data.figures, data.images);
|
| 2155 |
|
|
|
|
| 2197 |
for (const [placeholderId, figureData] of Object.entries(figurePlaceholders)) {
|
| 2198 |
let imageHtml = '';
|
| 2199 |
if (figureData.type === 'png' || figureData.type === 'jpeg') {
|
| 2200 |
+
imageHtml = `<img src="data:image/${figureData.type};base64,${figureData.data}" style="max-width: 400px; max-height: 400px; height: auto; border-radius: 4px; margin: 12px 0; display: block;" onclick="openImageModal(this.src)">`;
|
| 2201 |
} else if (figureData.type === 'svg') {
|
| 2202 |
imageHtml = `<div style="margin: 12px 0;">${atob(figureData.data)}</div>`;
|
| 2203 |
}
|
|
|
|
| 2393 |
const imgName = data.image_name || 'image';
|
| 2394 |
outputHtml = `<img src="data:image/png;base64,${data.image}" alt="${escapeHtml(imgName)}" class="generated-img" />`;
|
| 2395 |
} else if ((data.tool === 'generate_image' || data.tool === 'edit_image' || data.tool === 'read_image_url' || data.tool === 'read_image') && !data.image) {
|
| 2396 |
+
const errMsg = data.response || 'Failed to process image';
|
| 2397 |
+
outputHtml = `<div class="tool-cell-read-summary">${escapeHtml(errMsg)}</div>`;
|
| 2398 |
}
|
| 2399 |
|
| 2400 |
if (outputHtml && lastToolCell) {
|
|
|
|
| 2412 |
currentMessageEl = createAssistantMessage(chatContainer);
|
| 2413 |
}
|
| 2414 |
fullResponse += data.content;
|
| 2415 |
+
appendToMessage(currentMessageEl, resolveGlobalFigureRefs(parseMarkdown(fullResponse)));
|
| 2416 |
scrollChatToBottom(chatContainer);
|
| 2417 |
|
| 2418 |
} else if (data.type === 'launch') {
|
|
|
|
| 2544 |
errorDiv.innerHTML = `<div class="message-content" style="color: #c62828;">Error: ${escapeHtml(data.content)}</div>`;
|
| 2545 |
chatContainer.appendChild(errorDiv);
|
| 2546 |
scrollChatToBottom(chatContainer);
|
| 2547 |
+
|
| 2548 |
+
// Propagate error to parent action widget
|
| 2549 |
+
updateActionWidgetWithResult(tabId, `Error: ${data.content}`, {}, {});
|
| 2550 |
+
const errorWidget = actionWidgets[tabId];
|
| 2551 |
+
if (errorWidget) {
|
| 2552 |
+
const doneIndicator = errorWidget.querySelector('.done-indicator');
|
| 2553 |
+
if (doneIndicator) {
|
| 2554 |
+
doneIndicator.classList.add('errored');
|
| 2555 |
+
}
|
| 2556 |
+
}
|
| 2557 |
}
|
| 2558 |
}
|
| 2559 |
}
|
|
|
|
| 2898 |
for (const [placeholderId, figureData] of Object.entries(figurePlaceholders)) {
|
| 2899 |
let imageHtml = '';
|
| 2900 |
if (figureData.type === 'png' || figureData.type === 'jpeg') {
|
| 2901 |
+
imageHtml = `<img src="data:image/${figureData.type};base64,${figureData.data}" style="max-width: 400px; max-height: 400px; height: auto; border-radius: 4px; margin: 12px 0; display: block;" onclick="openImageModal(this.src)">`;
|
| 2902 |
} else if (figureData.type === 'svg') {
|
| 2903 |
imageHtml = `<div style="margin: 12px 0;">${atob(figureData.data)}</div>`;
|
| 2904 |
}
|
|
|
|
| 3148 |
});
|
| 3149 |
}
|
| 3150 |
|
| 3151 |
+
// Resolve <figure_N> and <image_N> references using the global registry.
// An opening (or bare) tag is swapped for the registered figure markup; the
// closing tag of a resolved ref is dropped so a paired form such as
// <figure_1>...</figure_1> renders the image once instead of twice.
function resolveGlobalFigureRefs(html) {
    return html.replace(/<(\/?)(figure_\d+|image_\d+)>/gi, (match, slash, name) => {
        const data = globalFigureRegistry[name];
        if (!data) return match; // Leave unresolved refs as-is
        if (slash) return ''; // Closing tag of a resolved ref: emit nothing
        if (data.type === 'png' || data.type === 'jpeg') {
            return `<img src="data:image/${data.type};base64,${data.data}" style="max-width: 400px; max-height: 400px; height: auto; border-radius: 4px; margin: 12px 0; display: block;" onclick="openImageModal(this.src)">`;
        } else if (data.type === 'svg') {
            return `<div style="margin: 12px 0;">${atob(data.data)}</div>`;
        }
        return match;
    });
}
|
| 3166 |
+
|
| 3167 |
function parseMarkdown(text) {
|
| 3168 |
// Use marked library for proper markdown parsing
|
| 3169 |
let html;
|
|
|
|
| 4177 |
// Show add/edit model dialog
|
| 4178 |
function showModelDialog(modelId = null) {
|
| 4179 |
const isEdit = !!modelId;
|
| 4180 |
+
const model = isEdit ? settings.models[modelId] : { name: '', providerId: '', modelId: '', extraParams: null, multimodal: false };
|
| 4181 |
|
| 4182 |
const dialog = document.getElementById('model-dialog');
|
| 4183 |
const title = document.getElementById('model-dialog-title');
|
|
|
|
| 4185 |
const providerSelect = document.getElementById('model-provider');
|
| 4186 |
const modelIdInput = document.getElementById('model-model-id');
|
| 4187 |
const extraParamsInput = document.getElementById('model-extra-params');
|
| 4188 |
+
const multimodalCheckbox = document.getElementById('model-multimodal');
|
| 4189 |
|
| 4190 |
title.textContent = isEdit ? 'Edit Model' : 'Add Model';
|
| 4191 |
nameInput.value = model.name;
|
| 4192 |
modelIdInput.value = model.modelId;
|
| 4193 |
extraParamsInput.value = model.extraParams ? JSON.stringify(model.extraParams, null, 2) : '';
|
| 4194 |
+
multimodalCheckbox.checked = !!model.multimodal;
|
| 4195 |
|
| 4196 |
// Populate provider dropdown
|
| 4197 |
providerSelect.innerHTML = '<option value="">-- Select Provider --</option>';
|
|
|
|
| 4238 |
}
|
| 4239 |
}
|
| 4240 |
|
| 4241 |
+
const multimodal = document.getElementById('model-multimodal').checked;
|
| 4242 |
+
settings.models[modelId] = { name, providerId, modelId: apiModelId, extraParams, multimodal };
|
| 4243 |
hideModelDialog();
|
| 4244 |
renderModelsList();
|
| 4245 |
populateModelDropdowns();
|
|
|
|
| 4544 |
endpoint: provider.endpoint,
|
| 4545 |
token: provider.token,
|
| 4546 |
model: model.modelId,
|
| 4547 |
+
extraParams: model.extraParams || null,
|
| 4548 |
+
multimodal: !!model.multimodal
|
| 4549 |
};
|
| 4550 |
}
|
| 4551 |
|
|
|
|
| 4563 |
endpoint: provider.endpoint,
|
| 4564 |
token: provider.token,
|
| 4565 |
model: model.modelId,
|
| 4566 |
+
extraParams: model.extraParams || null,
|
| 4567 |
+
multimodal: !!model.multimodal
|
| 4568 |
};
|
| 4569 |
}
|
| 4570 |
|
|
@@ -1129,6 +1129,20 @@ body {
|
|
| 1129 |
margin: 16px 0;
|
| 1130 |
}
|
| 1131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1132 |
/* LaTeX / KaTeX */
|
| 1133 |
.message-content .katex-display {
|
| 1134 |
margin: 12px 0;
|
|
@@ -1298,6 +1312,7 @@ pre code [class*="token"] {
|
|
| 1298 |
|
| 1299 |
.code-cell-image img {
|
| 1300 |
max-width: 400px;
|
|
|
|
| 1301 |
height: auto;
|
| 1302 |
border-radius: 4px;
|
| 1303 |
cursor: pointer;
|
|
@@ -1798,6 +1813,22 @@ pre code [class*="token"] {
|
|
| 1798 |
color: var(--bg-primary);
|
| 1799 |
}
|
| 1800 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1801 |
.action-widget-body {
|
| 1802 |
padding: 12px;
|
| 1803 |
background: var(--bg-tertiary);
|
|
@@ -1834,6 +1865,7 @@ pre code [class*="token"] {
|
|
| 1834 |
.action-widget .section-content img,
|
| 1835 |
.action-widget img {
|
| 1836 |
max-width: 400px !important;
|
|
|
|
| 1837 |
width: auto !important;
|
| 1838 |
height: auto !important;
|
| 1839 |
margin: 8px 0;
|
|
@@ -1896,6 +1928,7 @@ pre code [class*="token"] {
|
|
| 1896 |
|
| 1897 |
.action-widget-result img {
|
| 1898 |
max-width: 400px;
|
|
|
|
| 1899 |
height: auto;
|
| 1900 |
margin: 8px 0;
|
| 1901 |
border-radius: 3px;
|
|
@@ -2598,6 +2631,7 @@ pre code [class*="token"] {
|
|
| 2598 |
|
| 2599 |
.result-content img {
|
| 2600 |
max-width: 400px;
|
|
|
|
| 2601 |
height: auto;
|
| 2602 |
margin: 8px 0;
|
| 2603 |
border-radius: 3px;
|
|
@@ -3192,6 +3226,19 @@ pre code [class*="token"] {
|
|
| 3192 |
font-style: italic;
|
| 3193 |
}
|
| 3194 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3195 |
.settings-dialog-actions {
|
| 3196 |
display: flex;
|
| 3197 |
gap: 6px;
|
|
|
|
| 1129 |
margin: 16px 0;
|
| 1130 |
}
|
| 1131 |
|
| 1132 |
+
/* Inline images in chat messages: capped at 400x400 (aspect preserved),
   styled as clickable — presumably wired to the image modal; confirm against
   the JS that injects these <img> tags. */
.message-content img {
    max-width: 400px;
    max-height: 400px;
    height: auto;
    margin: 8px 0;
    border-radius: 3px;
    cursor: pointer;
    transition: opacity 0.2s;
}

/* Subtle hover feedback to signal the image is interactive. */
.message-content img:hover {
    opacity: 0.85;
}
|
| 1145 |
+
|
| 1146 |
/* LaTeX / KaTeX */
|
| 1147 |
.message-content .katex-display {
|
| 1148 |
margin: 12px 0;
|
|
|
|
| 1312 |
|
| 1313 |
.code-cell-image img {
|
| 1314 |
max-width: 400px;
|
| 1315 |
+
max-height: 400px;
|
| 1316 |
height: auto;
|
| 1317 |
border-radius: 4px;
|
| 1318 |
cursor: pointer;
|
|
|
|
| 1813 |
color: var(--bg-primary);
|
| 1814 |
}
|
| 1815 |
|
| 1816 |
+
/* Errored action widget - red background with exclamation */
.action-widget .done-indicator.errored {
    background: #c62828;
}
/* Replace the default indicator glyph with a bold "!" by resetting the
   base ::before box styling (width/height/border/transform) and drawing
   the character via content. */
.action-widget .done-indicator.errored::before {
    content: '!';
    width: auto;
    height: auto;
    border: none;
    transform: none;
    font-size: 11px;
    font-weight: bold;
    line-height: 1;
    color: white;
}
|
| 1831 |
+
|
| 1832 |
.action-widget-body {
|
| 1833 |
padding: 12px;
|
| 1834 |
background: var(--bg-tertiary);
|
|
|
|
| 1865 |
.action-widget .section-content img,
|
| 1866 |
.action-widget img {
|
| 1867 |
max-width: 400px !important;
|
| 1868 |
+
max-height: 400px !important;
|
| 1869 |
width: auto !important;
|
| 1870 |
height: auto !important;
|
| 1871 |
margin: 8px 0;
|
|
|
|
| 1928 |
|
| 1929 |
.action-widget-result img {
|
| 1930 |
max-width: 400px;
|
| 1931 |
+
max-height: 400px;
|
| 1932 |
height: auto;
|
| 1933 |
margin: 8px 0;
|
| 1934 |
border-radius: 3px;
|
|
|
|
| 2631 |
|
| 2632 |
.result-content img {
|
| 2633 |
max-width: 400px;
|
| 2634 |
+
max-height: 400px;
|
| 2635 |
height: auto;
|
| 2636 |
margin: 8px 0;
|
| 2637 |
border-radius: 3px;
|
|
|
|
| 3226 |
font-style: italic;
|
| 3227 |
}
|
| 3228 |
|
| 3229 |
+
/* Checkbox + label row in settings dialogs (used by the model dialog's
   multimodal toggle); flex keeps the box and text vertically centered. */
.dialog-checkbox-label {
    font-size: 11px;
    color: var(--text-secondary);
    display: flex;
    align-items: center;
    gap: 6px;
    cursor: pointer;
}

/* Strip the checkbox's default margin so the flex gap controls spacing. */
.dialog-checkbox-label input[type="checkbox"] {
    margin: 0;
}
|
| 3241 |
+
|
| 3242 |
.settings-dialog-actions {
|
| 3243 |
display: flex;
|
| 3244 |
gap: 6px;
|