Spaces:

Speedofmastery
/

HMM

Sleeping

HMM

File size: 51,593 Bytes

d7b3d84

"""Code-use agent service - Jupyter notebook-like code execution for browser automation."""

import asyncio
import datetime
import logging
import re
import traceback
from pathlib import Path
from typing import Any

from uuid_extensions import uuid7str

from browser_use.browser import BrowserSession
from browser_use.browser.profile import BrowserProfile
from browser_use.dom.service import DomService
from browser_use.filesystem.file_system import FileSystem
from browser_use.llm.base import BaseChatModel
from browser_use.llm.messages import (
	AssistantMessage,
	BaseMessage,
	ContentPartImageParam,
	ContentPartTextParam,
	ImageURL,
	UserMessage,
)
from browser_use.screenshots.service import ScreenshotService
from browser_use.telemetry.service import ProductTelemetry
from browser_use.telemetry.views import AgentTelemetryEvent
from browser_use.tokens.service import TokenCost
from browser_use.tokens.views import UsageSummary
from browser_use.tools.service import CodeAgentTools, Tools
from browser_use.utils import get_browser_use_version

from .formatting import format_browser_state_for_llm
from .namespace import EvaluateError, create_namespace
from .utils import detect_token_limit_issue, extract_code_blocks, extract_url_from_task, truncate_message_content
from .views import (
	CodeAgentHistory,
	CodeAgentModelOutput,
	CodeAgentResult,
	CodeAgentState,
	CodeAgentStepMetadata,
	ExecutionStatus,
	NotebookSession,
)

logger = logging.getLogger(__name__)


class CodeAgent:
	"""
	Agent that executes Python code in a notebook-like environment for browser automation.

	This agent provides a Jupyter notebook-like interface where the LLM writes Python code
	that gets executed in a persistent namespace with browser control functions available.
	"""

	def __init__(
		self,
		task: str,
		# Optional parameters
		llm: BaseChatModel | None = None,
		browser_session: BrowserSession | None = None,
		browser: BrowserSession | None = None,  # Alias for browser_session
		tools: Tools | None = None,
		controller: Tools | None = None,  # Alias for tools
		# Agent settings
		page_extraction_llm: BaseChatModel | None = None,
		file_system: FileSystem | None = None,
		available_file_paths: list[str] | None = None,
		sensitive_data: dict[str, str | dict[str, str]] | None = None,
		max_steps: int = 100,
		max_failures: int = 8,
		max_validations: int = 0,
		use_vision: bool = True,
		calculate_cost: bool = False,
		**kwargs,
	):
		"""
		Initialize the code-use agent.

		Args:
			task: The task description for the agent
			llm: Optional ChatBrowserUse LLM instance (a default ChatBrowserUse() is created if not provided)
			browser_session: Optional browser session (will be created in run() if not provided) [DEPRECATED: use browser]
			browser: Optional browser session (cleaner API); alias for browser_session
			tools: Optional Tools instance (CodeAgentTools() is used if not provided)
			controller: Optional Tools instance; alias for tools
			page_extraction_llm: Optional LLM for page extraction
			file_system: Optional file system for file operations (defaults to FileSystem(base_dir='./'))
			available_file_paths: Optional list of available file paths
			sensitive_data: Optional sensitive data dictionary
			max_steps: Maximum number of execution steps
			max_failures: Maximum consecutive errors before termination (default: 8)
			max_validations: Maximum number of times to run the validator agent (default: 0)
			use_vision: Whether to include screenshots in LLM messages (default: True)
			calculate_cost: Whether to calculate token costs (default: False)
			**kwargs: Additional keyword arguments for compatibility (ignored)

		Raises:
			RuntimeError: If no llm was given and the default ChatBrowserUse could not be created.
			ValueError: If the llm is not a ChatBrowserUse model, or if both members of an
				alias pair (browser/browser_session, controller/tools) are supplied.
		"""
		# Log and ignore unknown kwargs for compatibility
		if kwargs:
			logger.debug(f'Ignoring additional kwargs for CodeAgent compatibility: {list(kwargs.keys())}')

		if llm is None:
			try:
				from browser_use import ChatBrowserUse

				llm = ChatBrowserUse()
				logger.debug('CodeAgent using ChatBrowserUse')
			except Exception as e:
				# Chain the original exception so the root cause stays visible in the traceback
				raise RuntimeError(f'Failed to initialize CodeAgent LLM: {e}') from e

		if 'ChatBrowserUse' not in llm.__class__.__name__:
			raise ValueError('This agent works only with ChatBrowserUse.')

		# browser and browser_session are aliases; supplying both is ambiguous and rejected
		if browser and browser_session:
			raise ValueError('Cannot specify both "browser" and "browser_session" parameters. Use "browser" for the cleaner API.')
		browser_session = browser or browser_session

		# controller and tools are aliases; supplying both is ambiguous and rejected
		if controller and tools:
			raise ValueError('Cannot specify both "controller" and "tools" parameters. Use "controller" for the cleaner API.')
		tools = controller or tools

		# Keep a default profile around only when run() will have to create the browser session itself
		self._browser_profile_for_init = BrowserProfile() if browser_session is None else None

		self.task = task
		self.llm = llm
		self.browser_session = browser_session
		self.tools = tools or CodeAgentTools()
		self.page_extraction_llm = page_extraction_llm
		self.file_system = file_system if file_system is not None else FileSystem(base_dir='./')
		self.available_file_paths = available_file_paths or []
		self.sensitive_data = sensitive_data
		self.max_steps = max_steps
		self.max_failures = max_failures
		self.max_validations = max_validations
		self.use_vision = use_vision

		self.session = NotebookSession()
		self.namespace: dict[str, Any] = {}
		self._llm_messages: list[BaseMessage] = []  # Internal LLM conversation history
		self.complete_history: list[CodeAgentHistory] = []  # Type-safe history with model_output and result
		self.dom_service: DomService | None = None
		self._last_browser_state_text: str | None = None  # Track last browser state text
		self._last_screenshot: str | None = None  # Track last screenshot (base64)
		self._consecutive_errors = 0  # Track consecutive errors for auto-termination
		self._validation_count = 0  # Track number of validator runs
		self._last_llm_usage: Any | None = None  # Track last LLM call usage stats
		self._step_start_time = 0.0  # Track step start time for duration calculation
		self.usage_summary: UsageSummary | None = None  # Track usage summary across run for history property

		# Initialize screenshot service for eval tracking; each agent gets its own directory
		# named after its id and creation timestamp
		self.id = uuid7str()
		timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
		base_tmp = Path('/tmp')
		self.agent_directory = base_tmp / f'browser_use_code_agent_{self.id}_{timestamp}'
		self.screenshot_service = ScreenshotService(agent_directory=self.agent_directory)

		# Initialize token cost service for usage tracking
		self.token_cost_service = TokenCost(include_cost=calculate_cost)
		self.token_cost_service.register_llm(llm)
		if page_extraction_llm:
			self.token_cost_service.register_llm(page_extraction_llm)

		# Set version and source for telemetry: a checkout that still has the repository
		# layout next to the package is reported as 'git', anything else as 'pip'
		self.version = get_browser_use_version()
		try:
			package_root = Path(__file__).parent.parent.parent
			repo_files = ['.git', 'README.md', 'docs', 'examples']
			if all(Path(package_root / file).exists() for file in repo_files):
				self.source = 'git'
			else:
				self.source = 'pip'
		except Exception:
			self.source = 'unknown'

		# Telemetry
		self.telemetry = ProductTelemetry()

	async def run(self, max_steps: int | None = None) -> NotebookSession:
		"""
		Run the agent to complete the task.

		Starts a browser session when none was supplied, builds the execution namespace,
		navigates to a URL extracted from the task (if any), and then loops: request code
		from the LLM, execute it, record the step, and feed the result back. The loop ends
		when the task is marked done, the step budget is used up, `max_failures` consecutive
		errors (LLM failures, empty responses, or execution errors) occur, or a step raises
		an unexpected exception. Afterwards `close()` is called, usage is summarized, and a
		telemetry event is logged.

		Args:
			max_steps: Optional override for maximum number of steps (uses __init__ value if not provided).
				The effective value is also stored on `self.max_steps`.

		Returns:
			The notebook session with all executed cells
		"""
		# Use override if provided, otherwise use value from __init__
		steps_to_run = max_steps if max_steps is not None else self.max_steps
		self.max_steps = steps_to_run
		# Start browser if not provided
		if self.browser_session is None:
			assert self._browser_profile_for_init is not None
			self.browser_session = BrowserSession(browser_profile=self._browser_profile_for_init)
			await self.browser_session.start()

		# Initialize DOM service with cross-origin iframe support enabled
		self.dom_service = DomService(
			browser_session=self.browser_session,
			cross_origin_iframes=True,  # Enable for code-use agent to access forms in iframes
		)

		# Create namespace with all tools
		self.namespace = create_namespace(
			browser_session=self.browser_session,
			tools=self.tools,
			page_extraction_llm=self.page_extraction_llm,
			file_system=self.file_system,
			available_file_paths=self.available_file_paths,
			sensitive_data=self.sensitive_data,
		)

		# Initialize conversation with task
		self._llm_messages.append(UserMessage(content=f'Task: {self.task}'))

		# Track agent run error for telemetry (set when a step aborts the loop with an exception)
		agent_run_error: str | None = None

		# Extract URL from task and navigate if found
		initial_url = extract_url_from_task(self.task)
		if initial_url:
			try:
				logger.info(f'Extracted URL from task, navigating to: {initial_url}')
				# Use the navigate action from namespace
				await self.namespace['navigate'](initial_url)
				# Wait for page load
				await asyncio.sleep(2)

				# Record this navigation as a cell in the notebook
				nav_code = f"await navigate('{initial_url}')"
				cell = self.session.add_cell(source=nav_code)
				cell.status = ExecutionStatus.SUCCESS
				cell.execution_count = self.session.increment_execution_count()
				cell.output = f'Navigated to {initial_url}'

				# Get browser state after navigation for the cell
				if self.dom_service:
					try:
						browser_state_text, _ = await self._get_browser_state()
						cell.browser_state = browser_state_text
					except Exception as state_error:
						logger.debug(f'Failed to capture browser state for initial navigation cell: {state_error}')

			except Exception as e:
				logger.warning(f'Failed to navigate to extracted URL {initial_url}: {e}')
				# Record failed navigation as error cell
				nav_code = f"await navigate('{initial_url}')"
				cell = self.session.add_cell(source=nav_code)
				cell.status = ExecutionStatus.ERROR
				cell.execution_count = self.session.increment_execution_count()
				cell.error = str(e)

		# Get initial browser state before first LLM call
		if self.browser_session and self.dom_service:
			try:
				browser_state_text, screenshot = await self._get_browser_state()
				self._last_browser_state_text = browser_state_text
				self._last_screenshot = screenshot
			except Exception as e:
				logger.warning(f'Failed to get initial browser state: {e}')

		# Main execution loop
		for step in range(self.max_steps):
			logger.info(f'\n\n\n\n\n\n\nStep {step + 1}/{self.max_steps}')

			# Start timing this step
			self._step_start_time = datetime.datetime.now().timestamp()

			# Check if we're approaching the step limit or error limit and inject warning
			steps_remaining = self.max_steps - step - 1
			errors_remaining = self.max_failures - self._consecutive_errors

			should_warn = (
				steps_remaining <= 1  # Last step or next to last
				or errors_remaining <= 1  # One more error will terminate
				or (steps_remaining <= 2 and self._consecutive_errors >= 2)  # Close to both limits
			)

			if should_warn:
				warning_message = (
					f'\n\n⚠️ CRITICAL WARNING: You are approaching execution limits!\n'
					f'- Steps remaining: {steps_remaining + 1}\n'
					f'- Consecutive errors: {self._consecutive_errors}/{self.max_failures}\n\n'
					f'YOU MUST call done() in your NEXT response, even if the task is incomplete:\n'
					f"- Set success=False if you couldn't complete the task\n"
					f'- Return EVERYTHING you found so far (partial data is better than nothing)\n'
					f"- Include any variables you've stored (products, all_data, etc.)\n"
					f"- Explain what worked and what didn't\n\n"
					f'Without done(), the user will receive NOTHING.'
				)
				self._llm_messages.append(UserMessage(content=warning_message))

			try:
				# Fetch fresh browser state right before LLM call (only if not already set)
				if not self._last_browser_state_text and self.browser_session and self.dom_service:
					try:
						logger.debug('🔍 Fetching browser state before LLM call...')
						browser_state_text, screenshot = await self._get_browser_state()
						self._last_browser_state_text = browser_state_text
						self._last_screenshot = screenshot
					except Exception as e:
						logger.warning(f'Failed to get browser state before LLM call: {e}')

				# Get code from LLM (this also adds to self._llm_messages)
				try:
					code, full_llm_response = await self._get_code_from_llm()
				except Exception as llm_error:
					# LLM call failed - count as consecutive error and retry
					self._consecutive_errors += 1
					logger.warning(
						f'LLM call failed (consecutive errors: {self._consecutive_errors}/{self.max_failures}), retrying: {llm_error}'
					)

					# Check if we've hit the consecutive error limit
					if self._consecutive_errors >= self.max_failures:
						logger.error(f'Terminating: {self.max_failures} consecutive LLM failures')
						break

					await asyncio.sleep(1)  # Brief pause before retry
					continue

				if not code or code.strip() == '':
					# If task is already done, empty code is fine (LLM explaining completion)
					if self._is_task_done():
						logger.info('Task already marked as done, LLM provided explanation without code')
						# Add the text response to history as a non-code step
						await self._add_step_to_complete_history(
							model_output_code='',
							full_llm_response=full_llm_response,
							output=full_llm_response,  # Treat the explanation as output
							error=None,
							screenshot_path=await self._capture_screenshot(step + 1),
						)
						break  # Exit the loop since task is done

					logger.warning('LLM returned empty code')
					self._consecutive_errors += 1

					# Empty responses count toward max_failures like every other error path;
					# without this check a model stuck on empty output would consume the
					# entire step budget instead of terminating.
					if self._consecutive_errors >= self.max_failures:
						logger.error(
							f'Terminating: {self.max_failures} consecutive errors reached. The agent is unable to make progress.'
						)
						break

					# new state
					if self.browser_session and self.dom_service:
						try:
							browser_state_text, screenshot = await self._get_browser_state()
							self._last_browser_state_text = browser_state_text
							self._last_screenshot = screenshot
						except Exception as e:
							logger.warning(f'Failed to get new browser state: {e}')
					continue

				# Execute code blocks sequentially if multiple python blocks exist
				# This allows JS/bash blocks to be injected into namespace before Python code uses them
				all_blocks = self.namespace.get('_all_code_blocks', {})
				python_blocks = [k for k in sorted(all_blocks.keys()) if k.startswith('python_')]

				if len(python_blocks) > 1:
					# Multiple Python blocks - execute each sequentially
					output = None
					error = None

					for i, block_key in enumerate(python_blocks):
						logger.info(f'Executing Python block {i + 1}/{len(python_blocks)}')
						block_code = all_blocks[block_key]
						block_output, block_error, _ = await self._execute_code(block_code)

						# Accumulate outputs
						if block_output:
							output = (output or '') + block_output
						if block_error:
							error = block_error
							# Stop on first error
							break
				else:
					# Single Python block - execute normally
					output, error, _ = await self._execute_code(code)

				# Track consecutive errors
				if error:
					self._consecutive_errors += 1
					logger.warning(f'Consecutive errors: {self._consecutive_errors}/{self.max_failures}')

					# Check if we've hit the consecutive error limit
					if self._consecutive_errors >= self.max_failures:
						logger.error(
							f'Terminating: {self.max_failures} consecutive errors reached. The agent is unable to make progress.'
						)
						# Add termination message to complete history before breaking
						await self._add_step_to_complete_history(
							model_output_code=code,
							full_llm_response=f'[Terminated after {self.max_failures} consecutive errors]',
							output=None,
							error=f'Auto-terminated: {self.max_failures} consecutive errors without progress',
							screenshot_path=None,
						)
						break
				else:
					# Reset consecutive error counter on success
					self._consecutive_errors = 0

				# Check if task is done - validate completion first if not at limits
				if self._is_task_done():
					# Get the final result from namespace (from done() call)
					final_result: str | None = self.namespace.get('_task_result')  # type: ignore[assignment]

					# Check if we should validate (not at step/error limits and under max validations)
					steps_remaining = self.max_steps - step - 1
					should_validate = (
						self._validation_count < self.max_validations  # Haven't exceeded max validations
						and steps_remaining >= 4  # At least 4 steps away from limit
						and self._consecutive_errors < 3  # Not close to error limit (8 consecutive)
					)

					if should_validate:
						self._validation_count += 1
						logger.info('Validating task completion with LLM...')
						from .namespace import validate_task_completion

						is_complete, reasoning = await validate_task_completion(
							task=self.task,
							output=final_result,
							llm=self.llm,
						)

						if not is_complete:
							# Task not truly complete - inject feedback and continue
							logger.warning('Validator: Task not complete, continuing...')
							validation_feedback = (
								f'\n\n⚠️ VALIDATOR FEEDBACK:\n'
								f'Your done() call was rejected. The task is NOT complete yet.\n\n'
								f'Validation reasoning:\n{reasoning}\n\n'
								f'You must continue working on the task. Analyze what is missing and complete it.\n'
								f'Do NOT call done() again until the task is truly finished.'
							)

							# Clear the done flag so execution continues
							self.namespace['_task_done'] = False
							self.namespace.pop('_task_result', None)
							self.namespace.pop('_task_success', None)

							# Add validation feedback to LLM messages
							self._llm_messages.append(UserMessage(content=validation_feedback))

							# Don't override output - let execution continue normally
						else:
							logger.info('Validator: Task complete')
							# Override output with done message for final step
							if final_result:
								output = final_result
					else:
						# At limits - skip validation and accept done()
						if self._validation_count >= self.max_validations:
							logger.info(
								f'Reached max validations ({self.max_validations}) - skipping validation and accepting done()'
							)
						else:
							logger.info('At step/error limits - skipping validation')
						if final_result:
							output = final_result

				if output:
					# Check if this is the final done() output
					if self._is_task_done():
						# Show done() output more prominently (capped at 300 characters)
						logger.info(f'✓ Task completed - Final output from done():\n{output[:300]}')
						# Also show files_to_display if they exist in namespace
						attachments: list[str] | None = self.namespace.get('_task_attachments')  # type: ignore[assignment]
						if attachments:
							logger.info(f'Files displayed: {", ".join(attachments)}')
					else:
						logger.info(f'Code output:\n{output}')

				# Browser state is now only logged when fetched before LLM call (not after execution)

				# Take screenshot for eval tracking
				screenshot_path = await self._capture_screenshot(step + 1)

				# Add step to complete_history for eval system
				await self._add_step_to_complete_history(
					model_output_code=code,
					full_llm_response=full_llm_response,
					output=output,
					error=error,
					screenshot_path=screenshot_path,
				)

				# Check if task is done (after validation)
				if self._is_task_done():
					# Get the final result from namespace
					final_result: str | None = self.namespace.get('_task_result', output)  # type: ignore[assignment]
					logger.info('Task completed successfully')
					if final_result:
						logger.info(f'Final result: {final_result}')
					break
				# If validation rejected done(), continue to next iteration
				# The feedback message has already been added to _llm_messages

				# Add result to LLM messages for next iteration (without browser state)
				result_message = self._format_execution_result(code, output, error, current_step=step + 1)
				truncated_result = truncate_message_content(result_message)
				self._llm_messages.append(UserMessage(content=truncated_result))

			except Exception as e:
				logger.error(f'Error in step {step + 1}: {e}')
				traceback.print_exc()
				# Remember the failure so the telemetry event below reports it instead of None
				agent_run_error = str(e)
				break
		else:
			# Loop completed without break - max_steps reached
			logger.warning(f'Maximum steps ({self.max_steps}) reached without task completion')

		# If task is not done, capture the last step's output as partial result
		if not self._is_task_done() and self.complete_history:
			# Get the last step's output/error and use it as final extracted_content
			last_step = self.complete_history[-1]
			last_result = last_step.result[0] if last_step.result else None
			last_output = last_result.extracted_content if last_result else None
			last_error = last_result.error if last_result else None

			# Build a partial result message from the last step
			partial_result_parts = []
			partial_result_parts.append(f'Task incomplete - reached step limit ({self.max_steps} steps).')
			partial_result_parts.append('Last step output:')

			if last_output:
				partial_result_parts.append(f'\nOutput: {last_output}')
			if last_error:
				partial_result_parts.append(f'\nError: {last_error}')

			# Add any accumulated variables that might contain useful data
			data_vars = []
			for var_name in sorted(self.namespace.keys()):
				if not var_name.startswith('_') and var_name not in {'json', 'asyncio', 'csv', 're', 'datetime', 'Path'}:
					var_value = self.namespace[var_name]
					# Check if it's a list or dict that might contain collected data
					if isinstance(var_value, (list, dict)) and var_value:
						data_vars.append(f'  - {var_name}: {type(var_value).__name__} with {len(var_value)} items')

			if data_vars:
				partial_result_parts.append('\nVariables in namespace that may contain partial data:')
				partial_result_parts.extend(data_vars)

			partial_result = '\n'.join(partial_result_parts)

			# Update the last step's extracted_content with this partial result
			if last_result:
				last_result.extracted_content = partial_result
				last_result.is_done = False
				last_result.success = False

			logger.info(f'\nPartial result captured from last step:\n{partial_result}')

		# Log final summary if task was completed
		if self._is_task_done():
			logger.info('\n' + '=' * 60)
			logger.info('TASK COMPLETED SUCCESSFULLY')
			logger.info('=' * 60)
			final_result: str | None = self.namespace.get('_task_result')  # type: ignore[assignment]
			if final_result:
				logger.info(f'\nFinal Output:\n{final_result}')

			attachments: list[str] | None = self.namespace.get('_task_attachments')  # type: ignore[assignment]
			if attachments:
				logger.info(f'\nFiles Attached:\n{chr(10).join(attachments)}')
			logger.info('=' * 60 + '\n')

		# Auto-close browser if keep_alive is False
		await self.close()

		# Store usage summary for history property
		self.usage_summary = await self.token_cost_service.get_usage_summary()

		# Log token usage summary
		await self.token_cost_service.log_usage_summary()

		# Log telemetry event
		try:
			self._log_agent_event(max_steps=self.max_steps, agent_run_error=agent_run_error)
		except Exception as log_e:
			logger.error(f'Failed to log telemetry event: {log_e}', exc_info=True)

		return self.session

	async def _get_code_from_llm(self) -> tuple[str, str]:
		"""Request the next cell from the LLM and extract the code to run.

		The pending browser state (and screenshot, when vision is enabled) is sent as a
		one-off trailing user message and then cleared; it is never stored in the
		persistent conversation history. Non-python code blocks from the reply are
		injected into the namespace as string variables, and every extracted block is
		kept under `_all_code_blocks`.

		Returns:
			Tuple of (extracted_code, full_llm_response). When a token-limit/repetition
			problem is detected the code is '' and the second element is a bracketed
			error note; in that case the reply is not added to the history and a
			recovery prompt is appended instead. When the reply contains no 'python'
			block, the full completion text is returned as the code.
		"""
		# Work on a copy so the temporary browser-state message stays out of the history
		outgoing = list(self._llm_messages)

		state_text = self._last_browser_state_text
		if state_text:
			shot = self._last_screenshot
			if self.use_vision and shot:
				# Text plus screenshot in a single multi-part user message
				parts: list[ContentPartTextParam | ContentPartImageParam] = [
					ContentPartTextParam(text=state_text),
					ContentPartImageParam(
						image_url=ImageURL(
							url=f'data:image/jpeg;base64,{shot}',
							media_type='image/jpeg',
							detail='auto',
						),
					),
				]
				outgoing.append(UserMessage(content=parts))
			else:
				# Text only
				outgoing.append(UserMessage(content=state_text))

			# The state has been consumed; it must only appear in this single request
			self._last_browser_state_text = None
			self._last_screenshot = None

		response = await self.llm.ainvoke(outgoing)
		completion = response.completion
		usage = response.usage

		# Remember the usage stats of this call
		self._last_llm_usage = usage

		# Log the LLM's raw output for debugging
		logger.info(f'LLM Response:\n{completion}')

		# Detect responses that hit the token limit or degenerated into repetition
		is_problematic, issue_message = detect_token_limit_issue(
			completion=completion,
			completion_tokens=usage.completion_tokens if usage else None,
			max_tokens=getattr(self.llm, 'max_tokens', None),
			stop_reason=response.stop_reason,
		)

		if is_problematic:
			logger.warning(f'Token limit issue detected: {issue_message}')
			# Skip recording the bad reply; append a user message that prompts recovery instead
			self._llm_messages.append(
				UserMessage(
					content=(
						f'Your previous response hit a token limit or became repetitive: {issue_message}\n\n'
						'Please write a SHORT plan (2 sentences) for what to do next, then execute ONE simple action.'
					)
				)
			)
			# Hand back a controlled error note rather than corrupted code
			return '', f'[Token limit error: {issue_message}]'

		# Pull out fenced code blocks (python, js, bash, markdown, ...)
		code_blocks = extract_code_blocks(completion)

		# Names of namespace variables that hold code-block text (used for browser state display)
		self.namespace.setdefault('_code_block_vars', set())

		for block_type, block_content in code_blocks.items():
			if block_type.startswith('python'):
				continue
			# js, bash, markdown blocks (and named variants) become plain string variables
			self.namespace[block_type] = block_content
			self.namespace['_code_block_vars'].add(block_type)
			print(f'→ Code block variable: {block_type} (str, {len(block_content)} chars)')
			logger.debug(f'Injected {block_type} block into namespace ({len(block_content)} chars)')

		# Keep every block so the caller can execute multiple python blocks in order
		self.namespace['_all_code_blocks'] = code_blocks

		# Prefer the 'python' block; without one the whole completion is used as the code
		code = code_blocks.get('python', completion)

		# Record the reply in the history, truncated to save context
		self._llm_messages.append(AssistantMessage(content=truncate_message_content(completion)))

		return code, completion

	def _print_variable_info(self, var_name: str, value: Any) -> None:
		"""Print compact info about a variable assignment."""
		# Skip built-in modules and known imports
		skip_names = {
			'json',
			'asyncio',
			'csv',
			're',
			'datetime',
			'Path',
			'pd',
			'np',
			'plt',
			'requests',
			'BeautifulSoup',
			'PdfReader',
			'browser',
			'file_system',
		}
		if var_name in skip_names:
			return

		# Skip code block variables (already printed)
		if '_code_block_vars' in self.namespace and var_name in self.namespace.get('_code_block_vars', set()):
			return

		# Print compact variable info
		if isinstance(value, (list, dict)):
			preview = str(value)[:100]
			print(f'→ Variable: {var_name} ({type(value).__name__}, len={len(value)}, preview={preview}...)')
		elif isinstance(value, str) and len(value) > 50:
			print(f'→ Variable: {var_name} (str, {len(value)} chars, preview={value[:50]}...)')
		elif callable(value):
			print(f'→ Variable: {var_name} (function)')
		else:
			print(f'→ Variable: {var_name} ({type(value).__name__}, value={repr(value)[:50]})')

	async def _execute_code(self, code: str) -> tuple[str | None, str | None, str | None]:
		"""
		Execute Python code in the namespace.

		Args:
			code: The Python code to execute

		Returns:
			Tuple of (output, error, browser_state)
		"""
		# Create new cell
		cell = self.session.add_cell(source=code)
		cell.status = ExecutionStatus.RUNNING
		cell.execution_count = self.session.increment_execution_count()

		output = None
		error = None
		browser_state = None

		try:
			# Capture output
			import ast
			import io
			import sys

			old_stdout = sys.stdout
			sys.stdout = io.StringIO()

			try:
				# Add asyncio to namespace if not already there
				if 'asyncio' not in self.namespace:
					self.namespace['asyncio'] = asyncio

				# Store the current code in namespace for done() validation
				self.namespace['_current_cell_code'] = code
				# Store consecutive errors count for done() validation
				self.namespace['_consecutive_errors'] = self._consecutive_errors

				# Check if code contains await expressions - if so, wrap in async function
				# This mimics how Jupyter/IPython handles top-level await
				try:
					tree = ast.parse(code, mode='exec')
					has_await = any(isinstance(node, (ast.Await, ast.AsyncWith, ast.AsyncFor)) for node in ast.walk(tree))
				except SyntaxError:
					# If parse fails, let exec handle the error
					has_await = False

				if has_await:
					# When code has await, we must wrap in async function
					# To make variables persist naturally (like Jupyter without needing 'global'):
					# 1. Extract all assigned variable names from the code
					# 2. Inject 'global' declarations for variables that already exist in namespace
					# 3. Extract user's explicit global declarations and pre-define those vars
					# 4. Return locals() so we can update namespace with new variables

					# Find all variable names being assigned + user's explicit globals
					try:
						assigned_names = set()
						user_global_names = set()

						for node in ast.walk(tree):
							if isinstance(node, ast.Assign):
								for target in node.targets:
									if isinstance(target, ast.Name):
										assigned_names.add(target.id)
							elif isinstance(node, ast.AugAssign) and isinstance(node.target, ast.Name):
								assigned_names.add(node.target.id)
							elif isinstance(node, (ast.AnnAssign, ast.NamedExpr)):
								if hasattr(node, 'target') and isinstance(node.target, ast.Name):
									assigned_names.add(node.target.id)
							elif isinstance(node, ast.Global):
								# Track user's explicit global declarations
								user_global_names.update(node.names)

						# Pre-define any user-declared globals that don't exist yet
						# This prevents NameError when user writes "global foo" before "foo = ..."
						for name in user_global_names:
							if name not in self.namespace:
								self.namespace[name] = None

						# Filter to only existing namespace vars (like Jupyter does)
						# Include both: assigned vars that exist + user's explicit globals
						existing_vars = {name for name in (assigned_names | user_global_names) if name in self.namespace}
					except Exception as e:
						existing_vars = set()

					# Build global declaration if needed
					global_decl = ''
					has_global_decl = False
					if existing_vars:
						vars_str = ', '.join(sorted(existing_vars))
						global_decl = f'    global {vars_str}\n'
						has_global_decl = True

					indented_code = '\n'.join('    ' + line if line.strip() else line for line in code.split('\n'))
					wrapped_code = f"""async def __code_exec__():
{global_decl}{indented_code}
    # Return locals so we can update the namespace
    return locals()

__code_exec_coro__ = __code_exec__()
"""
					# Store whether we added a global declaration (needed for error line mapping)
					self.namespace['_has_global_decl'] = has_global_decl

					# Compile and execute wrapper at module level
					compiled_code = compile(wrapped_code, '<code>', 'exec')
					exec(compiled_code, self.namespace, self.namespace)

					# Get and await the coroutine, then update namespace with new/modified variables
					coro = self.namespace.get('__code_exec_coro__')
					if coro:
						result_locals = await coro
						# Update namespace with all variables from the function's locals
						# This makes variable assignments persist across cells
						if result_locals:
							for key, value in result_locals.items():
								if not key.startswith('_'):
									self.namespace[key] = value
									# Variable info is tracked in "Available" section, no need for verbose inline output

						# Clean up temporary variables
						self.namespace.pop('__code_exec_coro__', None)
						self.namespace.pop('__code_exec__', None)
				else:
					# No await - execute directly at module level for natural variable scoping
					# This means x = x + 10 will work without needing 'global x'

					# Track variables before execution
					vars_before = set(self.namespace.keys())

					compiled_code = compile(code, '<code>', 'exec')
					exec(compiled_code, self.namespace, self.namespace)

					# Track newly created/modified variables (info shown in "Available" section)
					vars_after = set(self.namespace.keys())
					new_vars = vars_after - vars_before

				# Get output
				output_value = sys.stdout.getvalue()
				if output_value:
					output = output_value

			finally:
				sys.stdout = old_stdout

			# Wait 0.5 seconds for page to stabilize after code execution
			await asyncio.sleep(0.5)

			# Note: Browser state is now fetched right before LLM call instead of after each execution
			# This reduces unnecessary state fetches for operations that don't affect the browser

			cell.status = ExecutionStatus.SUCCESS
			cell.output = output
			cell.browser_state = None  # Will be captured in next iteration before LLM call

		except Exception as e:
			# Handle EvaluateError specially - JavaScript execution failed
			if isinstance(e, EvaluateError):
				error = str(e)
				cell.status = ExecutionStatus.ERROR
				cell.error = error
				logger.error(f'Code execution error: {error}')

				await asyncio.sleep(1)

				# Browser state will be fetched before next LLM call
				# Return immediately - do not continue executing code
				return output, error, None

			# Handle NameError specially - check for code block variable confusion
			if isinstance(e, NameError):
				error_msg = str(e)
				cell.status = ExecutionStatus.ERROR
				cell.error = error

				# Browser state will be fetched before next LLM call
				await asyncio.sleep(0.5)
				return output, error, None

			# For syntax errors and common parsing errors, show just the error message
			# without the full traceback to keep output clean
			if isinstance(e, SyntaxError):
				error_msg = e.msg if e.msg else str(e)
				error = f'{type(e).__name__}: {error_msg}'

				# Detect common f-string issues with JSON/JavaScript code
				if 'unterminated' in error_msg.lower() and 'string' in error_msg.lower() and code:
					# Check if code contains f-strings with potential JSON/JS content
					has_fstring = bool(re.search(r'\bf["\']', code))
					has_json_pattern = bool(re.search(r'json\.dumps|"[^"]*\{[^"]*\}[^"]*"|\'[^\']*\{[^\']*\}[^\']*\'', code))
					has_js_pattern = bool(re.search(r'evaluate\(|await evaluate', code))

					if has_fstring and (has_json_pattern or has_js_pattern):
						error += (
							'\n\n💡 TIP: Detected f-string with JSON/JavaScript code containing {}.\n'
							'   Use separate ```js or ```markdown blocks instead of f-strings to avoid escaping issues.\n'
							'   If your code block needs ``` inside it, wrap with 4+ backticks: ````markdown code`\n'
						)

				# Detect and provide helpful hints for common string literal errors
				if 'unterminated' in error_msg.lower() and 'string' in error_msg.lower():
					# Detect what type of string literal is unterminated
					is_triple = 'triple-quoted' in error_msg.lower()
					msg_lower = error_msg.lower()

					# Detect prefix type from error message
					if 'f-string' in msg_lower and 'raw' in msg_lower:
						prefix = 'rf or fr'
						desc = 'raw f-string'
					elif 'f-string' in msg_lower:
						prefix = 'f'
						desc = 'f-string'
					elif 'raw' in msg_lower and 'bytes' in msg_lower:
						prefix = 'rb or br'
						desc = 'raw bytes'
					elif 'raw' in msg_lower:
						prefix = 'r'
						desc = 'raw string'
					elif 'bytes' in msg_lower:
						prefix = 'b'
						desc = 'bytes'
					else:
						prefix = ''
						desc = 'string'

					# Build hint based on triple-quoted vs single/double quoted
					if is_triple:
						if prefix:
							hint = f"Hint: Unterminated {prefix}'''...''' or {prefix}\"\"\"...\"\" ({desc}). Check for missing closing quotes or unescaped quotes inside."
						else:
							hint = "Hint: Unterminated '''...''' or \"\"\"...\"\" detected. Check for missing closing quotes or unescaped quotes inside."
						hint += '\n      If you need ``` inside your string, use a ````markdown varname` code block with 4+ backticks instead.'
					else:
						if prefix:
							hint = f'Hint: Unterminated {prefix}\'...\' or {prefix}"..." ({desc}). Check for missing closing quote or unescaped quotes inside.'
						else:
							hint = 'Hint: Unterminated \'...\' or "..." detected. Check for missing closing quote or unescaped quotes inside the string.'
					error += f'\n{hint}'

				# Show the problematic line from the code
				if e.text:
					error += f'\n{e.text}'
				elif e.lineno and code:
					# If e.text is empty, extract the line from the code
					lines = code.split('\n')
					if 0 < e.lineno <= len(lines):
						error += f'\n{lines[e.lineno - 1]}'

			else:
				# For other errors, try to extract useful information
				error_str = str(e)
				error = f'{type(e).__name__}: {error_str}' if error_str else f'{type(e).__name__} occurred'

				# For RuntimeError or other exceptions, try to extract traceback info
				# to show which line in the user's code actually failed
				if hasattr(e, '__traceback__'):
					# Walk the traceback to find the frame with '<code>' filename
					tb = e.__traceback__
					user_code_lineno = None
					while tb is not None:
						frame = tb.tb_frame
						if frame.f_code.co_filename == '<code>':
							# Found the frame executing user code
							# Get the line number from the traceback
							user_code_lineno = tb.tb_lineno
							break
						tb = tb.tb_next

			cell.status = ExecutionStatus.ERROR
			cell.error = error
			logger.error(f'Code execution error: {error}')

			await asyncio.sleep(1)

			# Browser state will be fetched before next LLM call

		return output, error, None

	async def _get_browser_state(self) -> tuple[str, str | None]:
		"""Get the current browser state as text with ultra-minimal DOM structure for code agents.

		Returns:
			Tuple of (browser_state_text, screenshot_base64)
		"""
		if not self.browser_session or not self.dom_service:
			return 'Browser state not available', None

		try:
			# Get full browser state including screenshot if use_vision is enabled
			include_screenshot = True
			state = await self.browser_session.get_browser_state_summary(include_screenshot=include_screenshot)

			# Format browser state with namespace context
			browser_state_text = await format_browser_state_for_llm(
				state=state, namespace=self.namespace, browser_session=self.browser_session
			)

			screenshot = state.screenshot if include_screenshot else None
			return browser_state_text, screenshot

		except Exception as e:
			logger.error(f'Failed to get browser state: {e}')
			return f'Error getting browser state: {e}', None

	def _format_execution_result(self, code: str, output: str | None, error: str | None, current_step: int | None = None) -> str:
		"""Format the execution result for the LLM (without browser state)."""
		result = []

		# Add step progress header if step number provided
		if current_step is not None:
			progress_header = f'Step {current_step}/{self.max_steps} executed'
			# Add consecutive failure tracking if there are errors
			if error and self._consecutive_errors > 0:
				progress_header += f' | Consecutive failures: {self._consecutive_errors}/{self.max_failures}'
			result.append(progress_header)

		if error:
			result.append(f'Error: {error}')

		if output:
			# Truncate output if too long
			if len(output) > 10000:
				output = output[:9950] + '\n[Truncated after 10000 characters]'
			result.append(f'Output: {output}')
		if len(result) == 0:
			result.append('Executed')
		return '\n'.join(result)

	def _is_task_done(self) -> bool:
		"""Check if the task is marked as done in the namespace."""
		# Check if 'done' was called by looking for a special marker in namespace
		return self.namespace.get('_task_done', False)

	async def _capture_screenshot(self, step_number: int) -> str | None:
		"""Capture and store screenshot for eval tracking."""
		if not self.browser_session:
			return None

		try:
			# Get browser state summary which includes screenshot
			state = await self.browser_session.get_browser_state_summary(include_screenshot=True)
			if state and state.screenshot:
				# Store screenshot using screenshot service
				screenshot_path = await self.screenshot_service.store_screenshot(state.screenshot, step_number)
				return str(screenshot_path) if screenshot_path else None
		except Exception as e:
			logger.warning(f'Failed to capture screenshot for step {step_number}: {e}')
			return None

	async def _add_step_to_complete_history(
		self,
		model_output_code: str,
		full_llm_response: str,
		output: str | None,
		error: str | None,
		screenshot_path: str | None,
	) -> None:
		"""Record one executed step in complete_history as a typed history entry.

		Args:
			model_output_code: Code for this step; stored as the entry's model_output.
			full_llm_response: LLM response text for this step; stored as full_response.
			output: Execution output; empty values are stored as None.
			error: Execution error; empty values are stored as None.
			screenshot_path: Screenshot path for this step, if any.
		"""
		# Best-effort lookup of page URL and title; a failure is only logged at debug level
		page_url: str | None = None
		page_title: str | None = None
		if self.browser_session:
			try:
				page_url = await self.browser_session.get_current_page_url()
				# The title is read by evaluating document.title over CDP
				cdp_session = await self.browser_session.get_or_create_cdp_session()
				title_eval = await cdp_session.cdp_client.send.Runtime.evaluate(
					params={'expression': 'document.title', 'returnByValue': True},
					session_id=cdp_session.session_id,
				)
				page_title = title_eval.get('result', {}).get('value')
			except Exception as exc:
				logger.debug(f'Failed to get browser URL/title for history: {exc}')

		# Self-reported success is only read once the task is done, and only kept if it is a bool
		task_done = self._is_task_done()
		reported_success: bool | None = None
		if task_done:
			success_flag = self.namespace.get('_task_success')
			if isinstance(success_flag, bool):
				reported_success = success_flag

		result_entry = CodeAgentResult(
			extracted_content=output or None,
			error=error or None,
			is_done=task_done,
			success=reported_success,
		)

		state_entry = CodeAgentState(url=page_url, title=page_title, screenshot_path=screenshot_path)

		# The step end time is taken now; the start time was recorded on the instance
		finished_at = datetime.datetime.now().timestamp()
		usage = self._last_llm_usage
		metadata_entry = CodeAgentStepMetadata(
			input_tokens=usage.prompt_tokens if usage else None,
			output_tokens=usage.completion_tokens if usage else None,
			step_start_time=self._step_start_time,
			step_end_time=finished_at,
		)

		# A model output entry exists only when there is code or a response to track
		model_entry: CodeAgentModelOutput | None = None
		if model_output_code or full_llm_response:
			model_entry = CodeAgentModelOutput(
				model_output=model_output_code or '',
				full_response=full_llm_response or '',
			)

		self.complete_history.append(
			CodeAgentHistory(
				model_output=model_entry,
				result=[result_entry],
				state=state_entry,
				metadata=metadata_entry,
				screenshot_path=screenshot_path,  # Keep for backward compatibility
			)
		)

	def _log_agent_event(self, max_steps: int, agent_run_error: str | None = None) -> None:
		"""Capture a telemetry event summarizing this agent run.

		Args:
			max_steps: Step limit reported in the event.
			agent_run_error: Error message for the run, if any.
		"""
		from urllib.parse import urlparse

		token_summary = self.token_cost_service.get_usage_tokens_for_model(self.llm.model)
		steps = self.complete_history

		# CodeAgent has no action history; each step is represented by its full LLM
		# response wrapped as a single action entry, or None when there is none
		action_history_data: list[list[dict[str, Any]] | None] = [
			[{'llm_response': step.model_output.full_response}]
			if step.model_output and step.model_output.full_response
			else None
			for step in steps
		]

		# Only a string task result is reported as the final response
		final_result: Any = self.namespace.get('_task_result')
		final_result_str: str | None = final_result if isinstance(final_result, str) else None

		# Distinct non-empty URLs in first-seen order
		urls_visited: list[str] = list(dict.fromkeys(step.state.url for step in steps if step.state.url))

		# Every non-empty error across all step results
		errors: list[str] = [res.error for step in steps for res in step.result if res.error]

		# A bool '_task_success' wins; otherwise a finished task counts as False, an unfinished one as None
		is_done = self._is_task_done()
		task_success: Any = self.namespace.get('_task_success')
		self_reported_success: bool | None
		if isinstance(task_success, bool):
			self_reported_success = task_success
		elif is_done:
			self_reported_success = False
		else:
			self_reported_success = None

		event = AgentTelemetryEvent(
			task=self.task,
			model=self.llm.model,
			model_provider=self.llm.provider,
			max_steps=max_steps,
			max_actions_per_step=1,  # CodeAgent executes one code cell per step
			use_vision=self.use_vision,
			version=self.version,
			source=self.source,
			cdp_url=urlparse(self.browser_session.cdp_url).hostname
			if self.browser_session and self.browser_session.cdp_url
			else None,
			agent_type='code',  # CodeAgent identifier
			action_errors=errors,
			action_history=action_history_data,
			urls_visited=urls_visited,
			steps=len(self.complete_history),
			total_input_tokens=token_summary.prompt_tokens,
			total_output_tokens=token_summary.completion_tokens,
			prompt_cached_tokens=token_summary.prompt_cached_tokens,
			total_tokens=token_summary.total_tokens,
			total_duration_seconds=sum(step.metadata.duration_seconds for step in self.complete_history if step.metadata),
			success=self_reported_success,
			final_result_response=final_result_str,
			error_message=agent_run_error,
		)
		self.telemetry.capture(event)

	def screenshot_paths(self, n_last: int | None = None) -> list[str | None]:
		"""
		Get screenshot paths from complete_history for eval system.

		Args:
			n_last: Optional number of last screenshots to return

		Returns:
			List of screenshot file paths (or None for missing screenshots)
		"""
		paths = [step.screenshot_path for step in self.complete_history]

		if n_last is not None:
			return paths[-n_last:] if len(paths) > n_last else paths

		return paths

	@property
	def message_manager(self) -> Any:
		"""
		Eval-system compatibility shim.
		Wraps the agent's LLM messages in a stand-in object that exposes them
		as its last_input_messages attribute.
		"""

		class MockMessageManager:
			def __init__(self, messages: list[BaseMessage]) -> None:
				# The messages are stored as-is under the attribute name the eval system reads
				self.last_input_messages = messages

		stand_in = MockMessageManager(self._llm_messages)
		return stand_in

	@property
	def history(self) -> Any:
		"""
		Eval-system compatibility shim.
		Builds a stand-in for an AgentHistoryList: its history attribute holds each entry of
		complete_history converted to an attribute-access object, and its usage attribute holds
		the usage summary. This supports callers that do: agent_history = agent.history
		"""

		def to_attr(value: Any) -> Any:
			# Dicts become objects; dict items directly inside a list do too; the rest passes through
			if isinstance(value, dict):
				return DictToObject(value)
			if isinstance(value, list):
				return [DictToObject(item) if isinstance(item, dict) else item for item in value]
			return value

		def to_plain(value: Any) -> Any:
			# Inverse of to_attr: objects (also directly inside a list) are dumped back to dicts
			if isinstance(value, DictToObject):
				return value.model_dump()
			if isinstance(value, list):
				return [item.model_dump() if isinstance(item, DictToObject) else item for item in value]
			return value

		class DictToObject:
			"""Expose a (possibly nested) dict through attribute access for eval compatibility."""

			def __init__(self, data: dict[str, Any]) -> None:
				for attr_name, raw_value in data.items():
					setattr(self, attr_name, to_attr(raw_value))

			def __getattr__(self, name: str) -> None:
				"""Return None for any attribute that was never set."""
				# Only reached when normal lookup fails; avoids AttributeError for fields
				# the eval system checks but CodeAgent doesn't set
				return None

			def model_dump(self) -> dict[str, Any]:
				"""Support model_dump() calls from eval system."""
				return {attr_name: to_plain(stored) for attr_name, stored in self.__dict__.items()}

			def get_screenshot(self) -> str | None:
				"""Support get_screenshot() calls for state objects."""
				# Reads the file at screenshot_path and returns it base64-encoded
				# (matching BrowserStateHistory implementation)
				stored_path = getattr(self, 'screenshot_path', None)
				if not stored_path:
					return None

				import base64
				from pathlib import Path

				image_file = Path(stored_path)
				if not image_file.exists():
					return None

				try:
					with open(image_file, 'rb') as handle:
						raw_bytes = handle.read()
					return base64.b64encode(raw_bytes).decode('utf-8')
				except Exception:
					return None

		class MockAgentHistoryList:
			def __init__(self, complete_history: list[CodeAgentHistory], usage_summary: UsageSummary | None) -> None:
				# Each CodeAgentHistory is dumped to a dict, then wrapped for attribute access
				self.history = [DictToObject(entry.model_dump()) for entry in complete_history]
				# The usage summary is passed through unchanged
				self.usage = usage_summary

		return MockAgentHistoryList(self.complete_history, self.usage_summary)

	async def close(self) -> None:
		"""Close the browser session."""
		if self.browser_session:
			# Check if we should close the browser based on keep_alive setting
			if not self.browser_session.browser_profile.keep_alive:
				await self.browser_session.kill()
			else:
				logger.debug('Browser keep_alive is True, not closing browser session')

	async def __aenter__(self) -> 'CodeAgent':
		"""Async context manager entry.

		Returns:
			This agent instance; no setup is performed here.
		"""
		return self

	async def __aexit__(self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: Any) -> None:
		"""Async context manager exit.

		Awaits close() whether or not the block raised. The exception arguments are
		not inspected, and nothing is returned, so an exception raised in the block
		is not suppressed.
		"""
		await self.close()