Spaces:
Sleeping
Sleeping
| """Utility functions for code-use agent.""" | |
| import re | |
def truncate_message_content(content: str, max_length: int = 10000) -> str:
    """Trim *content* to at most ``max_length`` characters for history storage.

    Content at or under the limit is returned unchanged; otherwise it is cut
    at the limit and a marker noting how many characters were dropped is
    appended.
    """
    overflow = len(content) - max_length
    if overflow <= 0:
        return content
    marker = f'\n\n[... truncated {overflow} characters for history]'
    return content[:max_length] + marker
| def detect_token_limit_issue( | |
| completion: str, | |
| completion_tokens: int | None, | |
| max_tokens: int | None, | |
| stop_reason: str | None, | |
| ) -> tuple[bool, str | None]: | |
| """ | |
| Detect if the LLM response hit token limits or is repetitive garbage. | |
| Returns: (is_problematic, error_message) | |
| """ | |
| # Check 1: Stop reason indicates max_tokens | |
| if stop_reason == 'max_tokens': | |
| return True, f'Response terminated due to max_tokens limit (stop_reason: {stop_reason})' | |
| # Check 2: Used 90%+ of max_tokens (if we have both values) | |
| if completion_tokens is not None and max_tokens is not None and max_tokens > 0: | |
| usage_ratio = completion_tokens / max_tokens | |
| if usage_ratio >= 0.9: | |
| return True, f'Response used {usage_ratio:.1%} of max_tokens ({completion_tokens}/{max_tokens})' | |
| # Check 3: Last 6 characters repeat 40+ times (repetitive garbage) | |
| if len(completion) >= 6: | |
| last_6 = completion[-6:] | |
| repetition_count = completion.count(last_6) | |
| if repetition_count >= 40: | |
| return True, f'Repetitive output detected: last 6 chars "{last_6}" appears {repetition_count} times' | |
| return False, None | |
| def extract_url_from_task(task: str) -> str | None: | |
| """Extract URL from task string using naive pattern matching.""" | |
| # Remove email addresses from task before looking for URLs | |
| task_without_emails = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', task) | |
| # Look for common URL patterns | |
| patterns = [ | |
| r'https?://[^\s<>"\']+', # Full URLs with http/https | |
| r'(?:www\.)?[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}(?:/[^\s<>"\']*)?', # Domain names with subdomains and optional paths | |
| ] | |
| found_urls = [] | |
| for pattern in patterns: | |
| matches = re.finditer(pattern, task_without_emails) | |
| for match in matches: | |
| url = match.group(0) | |
| # Remove trailing punctuation that's not part of URLs | |
| url = re.sub(r'[.,;:!?()\[\]]+$', '', url) | |
| # Add https:// if missing | |
| if not url.startswith(('http://', 'https://')): | |
| url = 'https://' + url | |
| found_urls.append(url) | |
| unique_urls = list(set(found_urls)) | |
| # If multiple URLs found, skip auto-navigation to avoid ambiguity | |
| if len(unique_urls) > 1: | |
| return None | |
| # If exactly one URL found, return it | |
| if len(unique_urls) == 1: | |
| return unique_urls[0] | |
| return None | |
def extract_code_blocks(text: str) -> dict[str, str]:
    """Extract all code blocks from a markdown response.

    Supports:
    - ```python, ```js, ```javascript, ```bash, ```sh, ```shell, ```markdown, ```md
    - Named blocks: ```js variable_name → saved as 'variable_name' in namespace
    - Nested blocks: use 4+ backticks for the outer block when the inner
      content contains 3 backticks

    Returns:
        Dict mapping block_name -> content.

    Note: Python blocks are NO LONGER COMBINED. Each python block executes
    separately to allow sequential execution with JS/bash blocks in between.
    """
    # Matches: ```lang\n, ```lang varname\n, or ````+lang\n (4+ backticks for
    # nested blocks). The backreference \1 requires the closing fence to use
    # the same backticks as the opening fence. The optional block name must be
    # separated by spaces/tabs only ([ \t]+): the previous \s+ could swallow
    # the newline and misparse the first word of an unnamed block as its name.
    pattern = r'(`{3,})(\w+)(?:[ \t]+(\w+))?\n(.*?)\1(?:\n|$)'
    matches = re.findall(pattern, text, re.DOTALL)

    # Normalize language aliases to canonical names; unknown languages skip.
    aliases = {
        'python': 'python',
        'javascript': 'js',
        'js': 'js',
        'markdown': 'markdown',
        'md': 'markdown',
        'bash': 'bash',  # previously missing: ```bash blocks were dropped
        'sh': 'bash',
        'shell': 'bash',
    }

    blocks: dict[str, str] = {}
    python_block_counter = 0
    for _backticks, lang, var_name, content in matches:
        lang_normalized = aliases.get(lang.lower())
        if lang_normalized is None:
            continue  # unknown language, skip
        # Only strip trailing whitespace; preserve leading for indentation.
        content = content.rstrip()
        if not content:
            continue
        if var_name:
            # Named block - use the variable name as the key.
            blocks[var_name] = content
        elif lang_normalized == 'python':
            # Unnamed Python blocks - unique keys preserve execution order.
            blocks[f'python_{python_block_counter}'] = content
            python_block_counter += 1
        else:
            # Other unnamed blocks (js, bash, markdown) - keep last one only.
            blocks[lang_normalized] = content

    # Alias the first unnamed python block as 'python' for backward compat
    # (applies whenever at least one unnamed python block was found).
    if python_block_counter > 0:
        blocks['python'] = blocks['python_0']

    # Fallback: no python block but a generic ``` block exists - treat the
    # generic block(s) as python, combined in order.
    if python_block_counter == 0 and 'python' not in blocks:
        generic_matches = re.findall(r'```\n(.*?)```', text, re.DOTALL)
        combined = '\n\n'.join(m.strip() for m in generic_matches if m.strip())
        if combined:
            blocks['python'] = combined
    return blocks