Spaces:
Paused
Paused
File size: 6,959 Bytes
8d1819a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
from typing import Any
from browser_use.llm import ChatGoogle
from python.helpers import dirty_json
# ------------------------------------------------------------------------------
# Gemini Helper for Output Conformance
# ------------------------------------------------------------------------------
# This function sanitizes and conforms the JSON output from Gemini to match
# the specific schema expectations of the browser-use library. It handles
# markdown fences, aliases actions (like 'complete_task' to 'done'), and
# intelligently constructs a valid 'data' object for the final action.
def _conform_action(item):
    """Normalize one entry of the model's ``action`` list.

    Returns the conformed single-key action dict, the original ``item`` when
    no normalization applies, or ``None`` when the entry should be dropped.
    """
    if not isinstance(item, dict):
        return None  # Skip non-dict items

    # Only the first key/value pair of an entry is inspected.
    action_key, action_value = next(iter(item.items()), (None, None))
    if not action_key:
        return None

    # Alias 'complete_task' to 'done' to handle inconsistencies
    if action_key == "complete_task":
        action_key = "done"

    payload = action_value or {}
    if not isinstance(payload, dict):
        # A truthy scalar/list payload (e.g. {"go_back": true}) cannot take
        # defaults; pass the entry through untouched instead of crashing.
        return item

    if action_key in ("scroll_down", "scroll_up", "scroll"):
        params = payload.copy()
        # A bare "scroll" defaults to scrolling down, same as "scroll_down".
        params.setdefault("down", action_key != "scroll_up")
        params.setdefault("num_pages", 1.0)
        return {"scroll": params}

    if action_key == "go_to_url":
        params = payload.copy()
        params.setdefault("new_tab", False)
        return {action_key: params}

    if action_key == "done":
        params = payload.copy()
        # If `data` is missing, construct it from other keys
        if "data" not in params:
            # Pop fields from the top-level `done` object
            response_text = params.pop("response", None)
            summary_text = params.pop("page_summary", None)
            title_text = params.pop("title", "Task Completed")
            params["data"] = {
                "title": title_text,
                # browser-use expects strings here, so fall back to fixed text
                "response": response_text or "Task completed successfully.",
                "page_summary": summary_text or "No page summary available.",
            }
        params.setdefault("success", True)
        return {action_key: params}

    # Unknown action: keep the entry exactly as the model produced it.
    return item


def gemini_clean_and_conform(text: str):
    """Parse Gemini's raw output and conform it to browser-use's action schema.

    Args:
        text: Raw model output; parsed with ``dirty_json.parse`` (which,
            per the original note, is expected to tolerate markdown fences).

    Returns:
        The conformed object serialized with ``dirty_json.stringify``, or
        ``None`` when parsing fails or the parsed value is not a dict.

    Normalizations applied to each entry of a list-valued ``"action"`` key:
        * non-dict entries and entries without a usable key are dropped;
        * ``complete_task`` is renamed to ``done``;
        * ``scroll_down`` / ``scroll_up`` / ``scroll`` become ``scroll`` with
          ``down`` and ``num_pages`` defaults;
        * ``go_to_url`` gets ``new_tab=False`` by default;
        * ``done`` gets a ``data`` object built from ``response``,
          ``page_summary`` and ``title`` when ``data`` is absent, and
          ``success=True`` by default;
        * entries whose payload is not a mapping, and unknown actions, are
          passed through unchanged.
    """
    try:
        # dirty_json parser is robust enough to handle markdown fences
        obj = dirty_json.parse(text)
    except Exception:
        return None  # return None if parsing fails

    if not isinstance(obj, dict):
        return None

    # Conform actions to browser-use expectations
    actions = obj.get("action")
    if isinstance(actions, list):
        conformed = (_conform_action(entry) for entry in actions)
        obj["action"] = [entry for entry in conformed if entry is not None]

    return dirty_json.stringify(obj)
# ------------------------------------------------------------------------------
# Monkey-patch for browser-use Gemini schema issue
# ------------------------------------------------------------------------------
# The original _fix_gemini_schema in browser_use.llm.google.chat.ChatGoogle
# removes the 'title' property but fails to remove it from the 'required' list,
# causing a validation error with the Gemini API. This patch corrects that behavior.
def _patched_fix_gemini_schema(self, schema: dict[str, Any]) -> dict[str, Any]:
    """
    Convert a Pydantic model schema to a Gemini-compatible schema.

    This function removes unsupported properties like 'additionalProperties' and resolves
    $ref references that Gemini doesn't support.

    Args:
        self: Unused; present because this function is installed as a method
            replacement for ``ChatGoogle._fix_gemini_schema``.
        schema: JSON-schema dict, optionally carrying a top-level ``$defs``
            table. It is not modified.

    Returns:
        A new schema dict in which:
          * ``$defs`` is removed and every ``$ref`` is replaced by the
            referenced definition (sibling keys of the ``$ref`` override the
            definition's keys); a ``$ref`` that is unknown or that would
            recurse into a definition already being expanded is dropped,
            leaving only its sibling keys;
          * every ``additionalProperties`` / ``title`` / ``default`` key is
            removed at any depth -- including properties that happen to carry
            one of those names;
          * those same names are removed from any ``required`` list, so the
            list never refers to a property that was stripped;
          * an object-typed node with an empty ``properties`` mapping gets a
            ``_placeholder`` string property.
    """
    unsupported_keys = ('additionalProperties', 'title', 'default')

    # Handle $defs and $ref resolution
    if '$defs' in schema:
        defs = schema['$defs']

        def resolve_refs(obj: Any, active: frozenset = frozenset()) -> Any:
            # `active` holds the definitions currently being expanded on this
            # path; it stops self-referential schemas from recursing forever.
            if isinstance(obj, dict):
                if '$ref' not in obj:
                    # Recursively process all dictionary values
                    return {k: resolve_refs(v, active) for k, v in obj.items()}
                ref_name = obj['$ref'].split('/')[-1]
                # Build new dicts instead of popping: the node may live inside
                # `defs`, and mutating it would break later uses of the same
                # definition.
                siblings = {k: v for k, v in obj.items() if k != '$ref'}
                if ref_name in defs and ref_name not in active:
                    # Replace the reference with the actual definition, then
                    # merge any additional properties from the reference
                    resolved = {**defs[ref_name], **siblings}
                    return resolve_refs(resolved, active | {ref_name})
                return siblings
            if isinstance(obj, list):
                return [resolve_refs(item, active) for item in obj]
            return obj

        schema = resolve_refs({k: v for k, v in schema.items() if k != '$defs'})

    # Remove unsupported properties
    def clean_schema(obj: Any) -> Any:
        if isinstance(obj, list):
            return [clean_schema(item) for item in obj]
        if not isinstance(obj, dict):
            return obj

        cleaned = {
            key: clean_schema(value)
            for key, value in obj.items()
            if key not in unsupported_keys
        }

        # Gemini doesn't allow empty OBJECT types: give an object with empty
        # properties at least one placeholder property.
        node_type = cleaned.get('type', '')
        properties = cleaned.get('properties')
        if (
            isinstance(node_type, str)
            and node_type.upper() == 'OBJECT'
            and isinstance(properties, dict)
            and not properties
        ):
            cleaned['properties'] = {'_placeholder': {'type': 'string'}}

        # PATCH: a property named like a stripped key was removed above, so it
        # must not remain in the required list either.
        if isinstance(cleaned.get('required'), list):
            cleaned['required'] = [
                name for name in cleaned['required'] if name not in unsupported_keys
            ]

        return cleaned

    return clean_schema(schema)
def apply():
    """Install the patched schema fixer on ``ChatGoogle``.

    Rebinds the class attribute ``ChatGoogle._fix_gemini_schema`` to
    ``_patched_fix_gemini_schema``; the assignment is made on the class itself.
    """
    setattr(ChatGoogle, "_fix_gemini_schema", _patched_fix_gemini_schema)
|