Merge pull request #3 from convergence-ai/alex/descriptions
Browse files- .gitignore +2 -1
- src/proxy_lite/cli.py +17 -1
- src/proxy_lite/client.py +6 -6
- src/proxy_lite/configs/default.yaml +3 -0
- src/proxy_lite/gif_maker.py +122 -0
- src/proxy_lite/recorder.py +6 -2
- src/proxy_lite/serializer.py +1 -1
.gitignore
CHANGED
|
@@ -171,4 +171,5 @@ cython_debug/
|
|
| 171 |
.pypirc
|
| 172 |
|
| 173 |
logs/
|
| 174 |
-
local_trajectories/
|
|
|
|
|
|
| 171 |
.pypirc
|
| 172 |
|
| 173 |
logs/
|
| 174 |
+
local_trajectories/
|
| 175 |
+
screenshots/
|
src/proxy_lite/cli.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
import argparse
|
| 2 |
import asyncio
|
|
|
|
| 3 |
import os
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import Optional
|
| 6 |
|
| 7 |
from proxy_lite import Runner, RunnerConfig
|
|
|
|
| 8 |
from proxy_lite.logger import logger
|
| 9 |
|
| 10 |
|
|
@@ -35,7 +37,21 @@ def do_command(args):
|
|
| 35 |
if args.viewport_height:
|
| 36 |
config.viewport_height = args.viewport_height
|
| 37 |
o = Runner(config=config)
|
| 38 |
-
asyncio.run(o.run(do_text))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
|
| 41 |
def main():
|
|
|
|
| 1 |
import argparse
|
| 2 |
import asyncio
|
| 3 |
+
import base64
|
| 4 |
import os
|
| 5 |
from pathlib import Path
|
| 6 |
from typing import Optional
|
| 7 |
|
| 8 |
from proxy_lite import Runner, RunnerConfig
|
| 9 |
+
from proxy_lite.gif_maker import create_run_gif
|
| 10 |
from proxy_lite.logger import logger
|
| 11 |
|
| 12 |
|
|
|
|
| 37 |
if args.viewport_height:
|
| 38 |
config.viewport_height = args.viewport_height
|
| 39 |
o = Runner(config=config)
|
| 40 |
+
result = asyncio.run(o.run(do_text))
|
| 41 |
+
|
| 42 |
+
final_screenshot = result.observations[-1].info["original_image"]
|
| 43 |
+
folder_path = Path(__file__).parent.parent.parent / "screenshots"
|
| 44 |
+
folder_path.mkdir(parents=True, exist_ok=True)
|
| 45 |
+
path = folder_path / f"{result.run_id}.png"
|
| 46 |
+
with open(path, "wb") as f:
|
| 47 |
+
f.write(base64.b64decode(final_screenshot))
|
| 48 |
+
logger.info(f"🤖 Final screenshot saved to {path}")
|
| 49 |
+
|
| 50 |
+
gif_folder_path = Path(__file__).parent.parent.parent / "gifs"
|
| 51 |
+
gif_folder_path.mkdir(parents=True, exist_ok=True)
|
| 52 |
+
gif_path = gif_folder_path / f"{result.run_id}.gif"
|
| 53 |
+
create_run_gif(result, gif_path, duration=1500)
|
| 54 |
+
logger.info(f"🤖 GIF saved to {gif_path}")
|
| 55 |
|
| 56 |
|
| 57 |
def main():
|
src/proxy_lite/client.py
CHANGED
|
@@ -15,7 +15,7 @@ from proxy_lite.history import MessageHistory
|
|
| 15 |
from proxy_lite.logger import logger
|
| 16 |
from proxy_lite.serializer import (
|
| 17 |
BaseSerializer,
|
| 18 |
-
|
| 19 |
)
|
| 20 |
from proxy_lite.tools import Tool
|
| 21 |
|
|
@@ -78,7 +78,7 @@ class OpenAIClientConfig(BaseClientConfig):
|
|
| 78 |
|
| 79 |
class OpenAIClient(BaseClient):
|
| 80 |
config: OpenAIClientConfig
|
| 81 |
-
serializer: ClassVar[
|
| 82 |
|
| 83 |
@cached_property
|
| 84 |
def external_client(self) -> AsyncOpenAI:
|
|
@@ -119,14 +119,14 @@ class ConvergenceClientConfig(BaseClientConfig):
|
|
| 119 |
|
| 120 |
class ConvergenceClient(OpenAIClient):
|
| 121 |
config: ConvergenceClientConfig
|
| 122 |
-
serializer: ClassVar[
|
| 123 |
_model_validated: bool = False
|
| 124 |
|
| 125 |
async def _validate_model(self) -> None:
|
| 126 |
try:
|
| 127 |
-
await self.external_client.
|
| 128 |
-
|
| 129 |
-
|
| 130 |
)
|
| 131 |
self._model_validated = True
|
| 132 |
logger.debug(f"Model {self.config.model_id} validated and connected to cluster")
|
|
|
|
| 15 |
from proxy_lite.logger import logger
|
| 16 |
from proxy_lite.serializer import (
|
| 17 |
BaseSerializer,
|
| 18 |
+
OpenAICompatibleSerializer,
|
| 19 |
)
|
| 20 |
from proxy_lite.tools import Tool
|
| 21 |
|
|
|
|
| 78 |
|
| 79 |
class OpenAIClient(BaseClient):
|
| 80 |
config: OpenAIClientConfig
|
| 81 |
+
serializer: ClassVar[OpenAICompatibleSerializer] = OpenAICompatibleSerializer()
|
| 82 |
|
| 83 |
@cached_property
|
| 84 |
def external_client(self) -> AsyncOpenAI:
|
|
|
|
| 119 |
|
| 120 |
class ConvergenceClient(OpenAIClient):
|
| 121 |
config: ConvergenceClientConfig
|
| 122 |
+
serializer: ClassVar[OpenAICompatibleSerializer] = OpenAICompatibleSerializer()
|
| 123 |
_model_validated: bool = False
|
| 124 |
|
| 125 |
async def _validate_model(self) -> None:
|
| 126 |
try:
|
| 127 |
+
response = await self.external_client.models.list()
|
| 128 |
+
assert self.config.model_id in [model.id for model in response.data], (
|
| 129 |
+
f"Model {self.config.model_id} not found in {response.data}"
|
| 130 |
)
|
| 131 |
self._model_validated = True
|
| 132 |
logger.debug(f"Model {self.config.model_id} validated and connected to cluster")
|
src/proxy_lite/configs/default.yaml
CHANGED
|
@@ -7,6 +7,7 @@ environment:
|
|
| 7 |
include_poi_text: true
|
| 8 |
headless: false
|
| 9 |
homepage: https://www.google.co.uk
|
|
|
|
| 10 |
solver:
|
| 11 |
name: simple
|
| 12 |
agent:
|
|
@@ -17,4 +18,6 @@ solver:
|
|
| 17 |
api_base: https://convergence-ai-demo-api.hf.space/v1
|
| 18 |
local_view: true
|
| 19 |
task_timeout: 1800
|
|
|
|
|
|
|
| 20 |
verbose: true
|
|
|
|
| 7 |
include_poi_text: true
|
| 8 |
headless: false
|
| 9 |
homepage: https://www.google.co.uk
|
| 10 |
+
keep_original_image: true
|
| 11 |
solver:
|
| 12 |
name: simple
|
| 13 |
agent:
|
|
|
|
| 18 |
api_base: https://convergence-ai-demo-api.hf.space/v1
|
| 19 |
local_view: true
|
| 20 |
task_timeout: 1800
|
| 21 |
+
environment_timeout: 1800
|
| 22 |
+
action_timeout: 1800
|
| 23 |
verbose: true
|
src/proxy_lite/gif_maker.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import re
|
| 3 |
+
import textwrap
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
|
| 6 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 7 |
+
|
| 8 |
+
from proxy_lite.environments.environment_base import Action, Observation
|
| 9 |
+
from proxy_lite.recorder import Run
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def create_run_gif(
|
| 13 |
+
run: Run, output_path: str, white_panel_width: int = 300, duration: int = 1500, resize_factor: int = 4
|
| 14 |
+
) -> None:
|
| 15 |
+
"""
|
| 16 |
+
Generate a gif from the Run object's history.
|
| 17 |
+
|
| 18 |
+
For each Observation record, the observation image is decoded from its base64
|
| 19 |
+
encoded string. If the next record is an Action, its text is drawn onto a
|
| 20 |
+
white panel. The observation image and the white panel are then concatenated
|
| 21 |
+
horizontally to produce a frame.
|
| 22 |
+
|
| 23 |
+
Parameters:
|
| 24 |
+
run (Run): A Run object with its history containing Observation and Action records.
|
| 25 |
+
output_path (str): The path where the GIF will be saved.
|
| 26 |
+
white_panel_width (int): The width of the white panel for displaying text.
|
| 27 |
+
Default increased to 400 for larger images.
|
| 28 |
+
duration (int): Duration between frames in milliseconds.
|
| 29 |
+
Increased here to slow the FPS (default is 1000ms).
|
| 30 |
+
resize_factor (int): The factor to resize the image down by.
|
| 31 |
+
"""
|
| 32 |
+
frames = []
|
| 33 |
+
history = run.history
|
| 34 |
+
i = 0
|
| 35 |
+
while i < len(history):
|
| 36 |
+
if isinstance(history[i], Observation):
|
| 37 |
+
observation = history[i]
|
| 38 |
+
image_data = observation.state.image
|
| 39 |
+
if not image_data:
|
| 40 |
+
i += 1
|
| 41 |
+
continue
|
| 42 |
+
# Decode the base64 image
|
| 43 |
+
image_bytes = base64.b64decode(image_data)
|
| 44 |
+
obs_img = Image.open(BytesIO(image_bytes)).convert("RGB")
|
| 45 |
+
|
| 46 |
+
# scale the image down
|
| 47 |
+
obs_img = obs_img.resize((obs_img.width // resize_factor, obs_img.height // resize_factor))
|
| 48 |
+
|
| 49 |
+
# Check if the next record is an Action and extract its text if available
|
| 50 |
+
action_text = ""
|
| 51 |
+
if i + 1 < len(history) and isinstance(history[i + 1], Action):
|
| 52 |
+
action = history[i + 1]
|
| 53 |
+
if action.text:
|
| 54 |
+
action_text = action.text
|
| 55 |
+
|
| 56 |
+
# extract observation and thinking from tags in the action text
|
| 57 |
+
observation_match = re.search(r"<observation>(.*?)</observation>", action_text, re.DOTALL)
|
| 58 |
+
observation_content = observation_match.group(1).strip() if observation_match else None
|
| 59 |
+
|
| 60 |
+
# Extract text between thinking tags if present
|
| 61 |
+
thinking_match = re.search(r"<thinking>(.*?)</thinking>", action_text, re.DOTALL)
|
| 62 |
+
thinking_content = thinking_match.group(1).strip() if thinking_match else None
|
| 63 |
+
|
| 64 |
+
if observation_content and thinking_content:
|
| 65 |
+
action_text = f"**OBSERVATION**\n{observation_content}\n\n**THINKING**\n{thinking_content}"
|
| 66 |
+
|
| 67 |
+
# Create a white panel (same height as the observation image)
|
| 68 |
+
panel = Image.new("RGB", (white_panel_width, obs_img.height), "white")
|
| 69 |
+
draw = ImageDraw.Draw(panel)
|
| 70 |
+
font = ImageFont.load_default()
|
| 71 |
+
|
| 72 |
+
# Wrap the action text if it is too long
|
| 73 |
+
max_chars_per_line = 40 # Adjusted for larger font size
|
| 74 |
+
wrapped_text = textwrap.fill(action_text, width=max_chars_per_line)
|
| 75 |
+
|
| 76 |
+
# Calculate text block size and center it on the panel
|
| 77 |
+
try:
|
| 78 |
+
# Use multiline_textbbox if available (returns bounding box tuple)
|
| 79 |
+
bbox = draw.multiline_textbbox((0, 0), wrapped_text, font=font)
|
| 80 |
+
text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
|
| 81 |
+
except AttributeError:
|
| 82 |
+
# Fallback for older Pillow versions: compute size for each line
|
| 83 |
+
lines = wrapped_text.splitlines() or [wrapped_text]
|
| 84 |
+
line_sizes = [draw.textsize(line, font=font) for line in lines]
|
| 85 |
+
text_width = max(width for width, _ in line_sizes)
|
| 86 |
+
text_height = sum(height for _, height in line_sizes)
|
| 87 |
+
text_x = (white_panel_width - text_width) // 2
|
| 88 |
+
text_y = (obs_img.height - text_height) // 2
|
| 89 |
+
draw.multiline_text((text_x, text_y), wrapped_text, fill="black", font=font, align="center")
|
| 90 |
+
|
| 91 |
+
# Create the combined frame by concatenating the observation image and the panel
|
| 92 |
+
total_width = obs_img.width + white_panel_width
|
| 93 |
+
combined_frame = Image.new("RGB", (total_width, obs_img.height))
|
| 94 |
+
combined_frame.paste(obs_img, (0, 0))
|
| 95 |
+
combined_frame.paste(panel, (obs_img.width, 0))
|
| 96 |
+
frames.append(combined_frame)
|
| 97 |
+
|
| 98 |
+
# Skip the Action record since it has been processed with this Observation
|
| 99 |
+
if i + 1 < len(history) and isinstance(history[i + 1], Action):
|
| 100 |
+
i += 2
|
| 101 |
+
else:
|
| 102 |
+
i += 1
|
| 103 |
+
else:
|
| 104 |
+
i += 1
|
| 105 |
+
|
| 106 |
+
if frames:
|
| 107 |
+
frames[0].save(output_path, save_all=True, append_images=frames[1:], duration=duration, loop=0)
|
| 108 |
+
else:
|
| 109 |
+
raise ValueError("No frames were generated from the Run object's history.")
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# Example usage:
|
| 113 |
+
if __name__ == "__main__":
|
| 114 |
+
from proxy_lite.recorder import Run
|
| 115 |
+
|
| 116 |
+
dummy_run = Run.load("0abdb4cb-f289-48b0-ba13-35ed1210f7c1")
|
| 117 |
+
|
| 118 |
+
num_steps = int(len(dummy_run.history) / 2)
|
| 119 |
+
print(f"Number of steps: {num_steps}")
|
| 120 |
+
output_gif_path = "trajectory.gif"
|
| 121 |
+
create_run_gif(dummy_run, output_gif_path, duration=1000)
|
| 122 |
+
print(f"Trajectory GIF saved to {output_gif_path}")
|
src/proxy_lite/recorder.py
CHANGED
|
@@ -3,7 +3,6 @@ from __future__ import annotations
|
|
| 3 |
import datetime
|
| 4 |
import json
|
| 5 |
import os
|
| 6 |
-
import sys
|
| 7 |
import uuid
|
| 8 |
from pathlib import Path
|
| 9 |
from typing import Any, Optional, Self
|
|
@@ -39,6 +38,11 @@ class Run(BaseModel):
|
|
| 39 |
created_at=str(datetime.datetime.now(datetime.UTC)),
|
| 40 |
)
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
@property
|
| 43 |
def observations(self) -> list[Observation]:
|
| 44 |
return [h for h in self.history if isinstance(h, Observation)]
|
|
@@ -80,7 +84,7 @@ class DataRecorder:
|
|
| 80 |
self.local_folder = local_folder
|
| 81 |
|
| 82 |
def initialise_run(self, task: str) -> Run:
|
| 83 |
-
self.local_folder = Path(
|
| 84 |
os.makedirs(self.local_folder, exist_ok=True)
|
| 85 |
return Run.initialise(task)
|
| 86 |
|
|
|
|
| 3 |
import datetime
|
| 4 |
import json
|
| 5 |
import os
|
|
|
|
| 6 |
import uuid
|
| 7 |
from pathlib import Path
|
| 8 |
from typing import Any, Optional, Self
|
|
|
|
| 38 |
created_at=str(datetime.datetime.now(datetime.UTC)),
|
| 39 |
)
|
| 40 |
|
| 41 |
+
@classmethod
|
| 42 |
+
def load(cls, run_id: str) -> Self:
|
| 43 |
+
with open(Path(__file__).parent.parent.parent / "local_trajectories" / f"{run_id}.json", "r") as f:
|
| 44 |
+
return cls(**json.load(f))
|
| 45 |
+
|
| 46 |
@property
|
| 47 |
def observations(self) -> list[Observation]:
|
| 48 |
return [h for h in self.history if isinstance(h, Observation)]
|
|
|
|
| 84 |
self.local_folder = local_folder
|
| 85 |
|
| 86 |
def initialise_run(self, task: str) -> Run:
|
| 87 |
+
self.local_folder = Path(__file__).parent.parent.parent / "local_trajectories"
|
| 88 |
os.makedirs(self.local_folder, exist_ok=True)
|
| 89 |
return Run.initialise(task)
|
| 90 |
|
src/proxy_lite/serializer.py
CHANGED
|
@@ -25,7 +25,7 @@ class BaseSerializer(BaseModel, ABC):
|
|
| 25 |
def serialize_tools(self, tools: list[Tool]) -> list[dict]: ...
|
| 26 |
|
| 27 |
|
| 28 |
-
class
|
| 29 |
def serialize_messages(self, message_history: MessageHistory) -> list[dict]:
|
| 30 |
return message_history.to_dict(exclude={"label"})
|
| 31 |
|
|
|
|
| 25 |
def serialize_tools(self, tools: list[Tool]) -> list[dict]: ...
|
| 26 |
|
| 27 |
|
| 28 |
+
class OpenAICompatibleSerializer(BaseSerializer):
|
| 29 |
def serialize_messages(self, message_history: MessageHistory) -> list[dict]:
|
| 30 |
return message_history.to_dict(exclude={"label"})
|
| 31 |
|