File size: 6,959 Bytes
8d1819a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from typing import Any
from browser_use.llm import ChatGoogle
from python.helpers import dirty_json


# ------------------------------------------------------------------------------
# Gemini Helper for Output Conformance
# ------------------------------------------------------------------------------
# This function sanitizes and conforms the JSON output from Gemini to match
# the specific schema expectations of the browser-use library. It handles
# markdown fences, aliases actions (like 'complete_task' to 'done'), and
# intelligently constructs a valid 'data' object for the final action.

def gemini_clean_and_conform(text: str):
    """Parse Gemini output and conform its actions to browser-use's schema.

    Every entry of the top-level ``"action"`` list is read as a single-key
    mapping ``{action_name: params}`` (only the first key is inspected) and
    rewritten as follows:

    * ``scroll_down`` / ``scroll_up`` / ``scroll`` become ``scroll`` with
      ``down`` and ``num_pages`` defaulted when absent.
    * ``go_to_url`` gets ``new_tab`` defaulted to ``False``.
    * ``complete_task`` is aliased to ``done``. A ``done`` without ``data``
      gets one built from its ``response`` / ``page_summary`` / ``title``
      fields, and ``success`` defaults to ``True``.
    * Any other action is passed through unchanged.

    List entries that are not dicts, or that are empty dicts, are dropped.

    Args:
        text: Raw model output to parse with ``dirty_json.parse``.

    Returns:
        The conformed object serialized with ``dirty_json.stringify``, or
        ``None`` when parsing raises or the parsed value is not a dict.
    """
    try:
        # dirty_json parser is robust enough to handle markdown fences
        obj = dirty_json.parse(text)
    except Exception:
        return None  # best-effort: unparseable output is reported as None

    if not isinstance(obj, dict):
        return None

    actions = obj.get("action")
    if not isinstance(actions, list):
        # Nothing to conform; serialize the object as parsed.
        return dirty_json.stringify(obj)

    normalized_actions = []
    for item in actions:
        if not isinstance(item, dict):
            continue  # Skip non-dict items

        action_key, action_value = next(iter(item.items()), (None, None))
        if not action_key:
            continue

        if not action_value:
            # Missing/empty payload: start from empty params.
            params = {}
        elif isinstance(action_value, dict):
            # Shallow copy so the parsed item itself is not modified.
            params = action_value.copy()
        else:
            # A truthy non-dict payload (string, number, list, ...) cannot be
            # given defaults. Pass the item through untouched instead of
            # raising, so one odd action does not abort the whole pass.
            normalized_actions.append(item)
            continue

        # Alias 'complete_task' to 'done' to handle inconsistencies
        if action_key == "complete_task":
            action_key = "done"

        if action_key in ("scroll_down", "scroll_up", "scroll"):
            # Plain 'scroll' and 'scroll_down' default to scrolling down.
            params.setdefault("down", action_key != "scroll_up")
            params.setdefault("num_pages", 1.0)
            normalized_actions.append({"scroll": params})
        elif action_key == "go_to_url":
            params.setdefault("new_tab", False)
            normalized_actions.append({action_key: params})
        elif action_key == "done":
            # If `data` is missing, construct it from the flat fields that
            # were placed directly on the `done` object.
            if "data" not in params:
                response_text = params.pop("response", None)
                summary_text = params.pop("page_summary", None)
                title_text = params.pop("title", "Task Completed")

                params["data"] = {
                    "title": title_text,
                    # browser-use expects strings here, so fall back to
                    # fixed text when the model supplied nothing.
                    "response": response_text or "Task completed successfully.",
                    "page_summary": summary_text or "No page summary available.",
                }

            params.setdefault("success", True)
            normalized_actions.append({action_key: params})
        else:
            normalized_actions.append(item)

    obj["action"] = normalized_actions
    return dirty_json.stringify(obj)

# ------------------------------------------------------------------------------
# Monkey-patch for browser-use Gemini schema issue
# ------------------------------------------------------------------------------
# The original _fix_gemini_schema in browser_use.llm.google.chat.ChatGoogle
# removes the 'title' property but fails to remove it from the 'required' list,
# causing a validation error with the Gemini API. This patch corrects that behavior.

def _patched_fix_gemini_schema(self, schema: dict[str, Any]) -> dict[str, Any]:
    """
    Convert a Pydantic model to a Gemini-compatible schema.

    This function removes unsupported properties like 'additionalProperties' and resolves
    $ref references that Gemini doesn't support.
    """

    # Handle $defs and $ref resolution
    if '$defs' in schema:
        defs = schema.pop('$defs')

        def resolve_refs(obj: Any) -> Any:
            if isinstance(obj, dict):
                if '$ref' in obj:
                    ref = obj.pop('$ref')
                    ref_name = ref.split('/')[-1]
                    if ref_name in defs:
                        # Replace the reference with the actual definition
                        resolved = defs[ref_name].copy()
                        # Merge any additional properties from the reference
                        for key, value in obj.items():
                            if key != '$ref':
                                resolved[key] = value
                        return resolve_refs(resolved)
                    return obj
                else:
                    # Recursively process all dictionary values
                    return {k: resolve_refs(v) for k, v in obj.items()}
            elif isinstance(obj, list):
                return [resolve_refs(item) for item in obj]
            return obj

        schema = resolve_refs(schema)

    # Remove unsupported properties
    def clean_schema(obj: Any) -> Any:
        if isinstance(obj, dict):
            # Remove unsupported properties
            cleaned = {}
            for key, value in obj.items():
                if key not in ['additionalProperties', 'title', 'default']:
                    cleaned_value = clean_schema(value)
                    # Handle empty object properties - Gemini doesn't allow empty OBJECT types
                    if (
                        key == 'properties'
                        and isinstance(cleaned_value, dict)
                        and len(cleaned_value) == 0
                        and isinstance(obj.get('type', ''), str)
                        and obj.get('type', '').upper() == 'OBJECT'
                    ):
                        # Convert empty object to have at least one property
                        cleaned['properties'] = {'_placeholder': {'type': 'string'}}
                    else:
                        cleaned[key] = cleaned_value

            # If this is an object type with empty properties, add a placeholder
            if (
                isinstance(cleaned.get('type', ''), str)
                and cleaned.get('type', '').upper() == 'OBJECT'
                and 'properties' in cleaned
                and isinstance(cleaned['properties'], dict)
                and len(cleaned['properties']) == 0
            ):
                cleaned['properties'] = {'_placeholder': {'type': 'string'}}

            # PATCH: Also remove 'title' from the required list if it exists
            if 'required' in cleaned and isinstance(cleaned.get('required'), list):
                cleaned['required'] = [p for p in cleaned['required'] if p != 'title']

            return cleaned
        elif isinstance(obj, list):
            return [clean_schema(item) for item in obj]
        return obj

    return clean_schema(schema)

def apply():
    """Install the patched schema fixer on ChatGoogle.

    Rebinds the class attribute ``ChatGoogle._fix_gemini_schema`` to
    ``_patched_fix_gemini_schema``. Calling this more than once simply
    repeats the same assignment.
    """
    replacement = _patched_fix_gemini_schema
    ChatGoogle._fix_gemini_schema = replacement