Spaces:
Sleeping
Sleeping
| """Utility functions for code-use agent.""" | |
| import re | |
def truncate_message_content(content: str, max_length: int = 10000) -> str:
    """Trim *content* to at most ``max_length`` characters for history storage.

    Content at or under the limit is returned unchanged; otherwise it is cut
    at the limit and a marker noting how many characters were dropped is
    appended.
    """
    overflow = len(content) - max_length
    if overflow <= 0:
        return content
    marker = f'\n\n[... truncated {overflow} characters for history]'
    return content[:max_length] + marker
| def detect_token_limit_issue( | |
| completion: str, | |
| completion_tokens: int | None, | |
| max_tokens: int | None, | |
| stop_reason: str | None, | |
| ) -> tuple[bool, str | None]: | |
| """ | |
| Detect if the LLM response hit token limits or is repetitive garbage. | |
| Returns: (is_problematic, error_message) | |
| """ | |
| # Check 1: Stop reason indicates max_tokens | |
| if stop_reason == 'max_tokens': | |
| return True, f'Response terminated due to max_tokens limit (stop_reason: {stop_reason})' | |
| # Check 2: Used 90%+ of max_tokens (if we have both values) | |
| if completion_tokens is not None and max_tokens is not None and max_tokens > 0: | |
| usage_ratio = completion_tokens / max_tokens | |
| if usage_ratio >= 0.9: | |
| return True, f'Response used {usage_ratio:.1%} of max_tokens ({completion_tokens}/{max_tokens})' | |
| # Check 3: Last 6 characters repeat 40+ times (repetitive garbage) | |
| if len(completion) >= 6: | |
| last_6 = completion[-6:] | |
| repetition_count = completion.count(last_6) | |
| if repetition_count >= 40: | |
| return True, f'Repetitive output detected: last 6 chars "{last_6}" appears {repetition_count} times' | |
| return False, None | |
| def extract_url_from_task(task: str) -> str | None: | |
| """Extract URL from task string using naive pattern matching.""" | |
| # Remove email addresses from task before looking for URLs | |
| task_without_emails = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', task) | |
| # Look for common URL patterns | |
| patterns = [ | |
| r'https?://[^\s<>"\']+', # Full URLs with http/https | |
| r'(?:www\.)?[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}(?:/[^\s<>"\']*)?', # Domain names with subdomains and optional paths | |
| ] | |
| found_urls = [] | |
| for pattern in patterns: | |
| matches = re.finditer(pattern, task_without_emails) | |
| for match in matches: | |
| url = match.group(0) | |
| # Remove trailing punctuation that's not part of URLs | |
| url = re.sub(r'[.,;:!?()\[\]]+$', '', url) | |
| # Add https:// if missing | |
| if not url.startswith(('http://', 'https://')): | |
| url = 'https://' + url | |
| found_urls.append(url) | |
| unique_urls = list(set(found_urls)) | |
| # If multiple URLs found, skip auto-navigation to avoid ambiguity | |
| if len(unique_urls) > 1: | |
| return None | |
| # If exactly one URL found, return it | |
| if len(unique_urls) == 1: | |
| return unique_urls[0] | |
| return None | |
def extract_code_blocks(text: str) -> dict[str, str]:
    """Extract all code blocks from a markdown response.

    Supports:
    - ```python, ```js, ```javascript, ```bash, ```sh, ```shell, ```markdown, ```md
    - Named blocks: ```js variable_name → saved as 'variable_name' in namespace
    - Nested blocks: use 4+ backticks for the outer block when the inner
      content contains 3 backticks

    Returns:
        Dict mapping block_name -> content.

    Note: Python blocks are NO LONGER COMBINED. Each python block executes
    separately to allow sequential execution with JS/bash blocks in between.
    """
    # Matches: ```lang\n, ```lang varname\n, or ````+lang\n (4+ backticks for
    # nested blocks). The backreference \1 requires the closing fence to use
    # the same backticks as the opening fence. The optional block name must be
    # separated by spaces/tabs only ([ \t]+): the previous \s+ could swallow
    # the newline and misparse the first word of an unnamed block as its name.
    pattern = r'(`{3,})(\w+)(?:[ \t]+(\w+))?\n(.*?)\1(?:\n|$)'
    matches = re.findall(pattern, text, re.DOTALL)

    # Normalize language aliases to canonical names; unknown languages skip.
    aliases = {
        'python': 'python',
        'javascript': 'js',
        'js': 'js',
        'markdown': 'markdown',
        'md': 'markdown',
        'bash': 'bash',  # previously missing: ```bash blocks were dropped
        'sh': 'bash',
        'shell': 'bash',
    }

    blocks: dict[str, str] = {}
    python_block_counter = 0
    for _backticks, lang, var_name, content in matches:
        lang_normalized = aliases.get(lang.lower())
        if lang_normalized is None:
            continue  # unknown language, skip
        # Only strip trailing whitespace; preserve leading for indentation.
        content = content.rstrip()
        if not content:
            continue
        if var_name:
            # Named block - use the variable name as the key.
            blocks[var_name] = content
        elif lang_normalized == 'python':
            # Unnamed Python blocks - unique keys preserve execution order.
            blocks[f'python_{python_block_counter}'] = content
            python_block_counter += 1
        else:
            # Other unnamed blocks (js, bash, markdown) - keep last one only.
            blocks[lang_normalized] = content

    # Alias the first unnamed python block as 'python' for backward compat
    # (applies whenever at least one unnamed python block was found).
    if python_block_counter > 0:
        blocks['python'] = blocks['python_0']

    # Fallback: no python block but a generic ``` block exists - treat the
    # generic block(s) as python, combined in order.
    if python_block_counter == 0 and 'python' not in blocks:
        generic_matches = re.findall(r'```\n(.*?)```', text, re.DOTALL)
        combined = '\n\n'.join(m.strip() for m in generic_matches if m.strip())
        if combined:
            blocks['python'] = combined
    return blocks