import json
import re
from typing import Dict, List, Optional

from bs4 import BeautifulSoup
from smolagents.tools import Tool


class HTMLToWPBlocksTool(Tool):
    """Tool that rewrites an HTML document as WordPress Gutenberg block markup.

    Each top-level element whose tag appears in ``BLOCK_MAPPINGS`` is wrapped
    in ``<!-- wp:NAME {attrs} -->`` / ``<!-- /wp:NAME -->`` delimiters;
    any other top-level element is passed through as plain HTML.
    """

    name = "html_to_wp_blocks"
    description = "Transforms HTML content into WordPress Gutenberg blocks"
    inputs = {
        'html_content': {
            'type': 'string',
            'description': 'The HTML content to transform',
        },
        'preserve_classes': {
            'type': 'boolean',
            'description': 'Whether to preserve HTML class attributes as block attributes',
            'nullable': True,
        },
    }
    output_type = "string"

    # Mapping of HTML elements to WordPress block names
    BLOCK_MAPPINGS = {
        'p': 'core/paragraph',
        'h1': 'core/heading',
        'h2': 'core/heading',
        'h3': 'core/heading',
        'h4': 'core/heading',
        'h5': 'core/heading',
        'h6': 'core/heading',
        'ul': 'core/list',
        'ol': 'core/list',
        'li': 'core/list-item',
        'img': 'core/image',
        'figure': 'core/image',
        'blockquote': 'core/quote',
        'pre': 'core/code',
        'code': 'core/code',
        'table': 'core/table',
    }

    def __init__(self):
        super().__init__()

    def _class_list(self, element) -> List[str]:
        """Return the element's CSS classes as a list (empty if it has none)."""
        classes = element.attrs.get('class') or []
        # BeautifulSoup normally yields a list for ``class``; tolerate a plain
        # string so membership tests never degrade to substring matching.
        if isinstance(classes, str):
            classes = classes.split()
        return list(classes)

    def _get_block_attributes(self, element) -> Dict:
        """Extract relevant attributes from HTML element for block attributes.

        Args:
            element: A BeautifulSoup tag.

        Returns:
            A dict that may contain ``level`` (headings), ``align`` (derived
            from an ``align*`` CSS class) and ``url`` / ``alt`` (images).
        """
        attrs = {}

        # Handle heading levels. Keying off the mapping instead of indexing
        # ``element.name[1]`` avoids an IndexError on a one-character tag name.
        if self.BLOCK_MAPPINGS.get(element.name) == 'core/heading':
            attrs['level'] = int(element.name[1])

        # Handle alignment. No early exit: when several alignment classes are
        # present, the one listed last here wins.
        classes = self._class_list(element)
        for align in ('alignleft', 'alignright', 'aligncenter',
                      'alignwide', 'alignfull'):
            if align in classes:
                attrs['align'] = align.replace('align', '')

        # Handle images
        if element.name == 'img':
            attrs['url'] = element.get('src', '')
            if element.get('alt'):
                attrs['alt'] = element['alt']

        return attrs

    def _serialize_attributes(self, attrs: Dict) -> str:
        """Serialize block attributes as JSON safe to embed in an HTML comment."""
        serialized = json.dumps(attrs, separators=(',', ':'))
        # A literal "--" or ">" inside the JSON could terminate the
        # surrounding comment early, so these sequences are written as JSON
        # unicode escapes (the same substitutions the block editor's
        # serializer applies).
        return (
            serialized
            .replace('--', '\\u002d\\u002d')
            .replace('<', '\\u003c')
            .replace('>', '\\u003e')
            .replace('&', '\\u0026')
        )

    def _element_to_block(self, element, preserve_classes: bool = False) -> str:
        """Convert a single HTML element to a WordPress block.

        Args:
            element: A BeautifulSoup tag.
            preserve_classes: When true, copy the element's CSS classes into
                the block's ``className`` attribute.

        Returns:
            The block markup (opening delimiter, content, closing delimiter
            separated by newlines), or ``str(element)`` unchanged when the
            tag has no entry in ``BLOCK_MAPPINGS``.
        """
        tag = element.name
        if tag not in self.BLOCK_MAPPINGS:
            return str(element)

        block_name = self.BLOCK_MAPPINGS[tag]
        attrs = self._get_block_attributes(element)

        if preserve_classes and 'class' in element.attrs:
            attrs['className'] = ' '.join(self._class_list(element))

        # Handle nested content
        inner_content = element.decode_contents().strip() if element.contents else ""

        # Create block comment wrapper; the attribute JSON is omitted
        # entirely when there are no attributes.
        if attrs:
            block_start = f'<!-- wp:{block_name} {self._serialize_attributes(attrs)} -->'
        else:
            block_start = f'<!-- wp:{block_name} -->'
        block_end = f'<!-- /wp:{block_name} -->'

        # Wrap content in appropriate HTML tag
        if tag == 'p':
            content = f'<p>{inner_content}</p>'
        elif block_name == 'core/heading':
            content = f'<{tag} class="wp-block-heading">{inner_content}</{tag}>'
        elif tag == 'img':
            content = str(element)  # Keep original img tag
        elif tag in ('ul', 'ol', 'pre', 'code'):
            content = f'<{tag}>{inner_content}</{tag}>'
        elif tag == 'li':
            content = f'<li>{inner_content}</li>'
        else:
            # figure, blockquote, table: only the inner markup is emitted.
            content = inner_content

        return f'{block_start}\n{content}\n{block_end}'

    def forward(self, html_content: str, preserve_classes: bool = False) -> str:
        """Transform HTML content into WordPress blocks

        Args:
            html_content: The HTML content to transform. A dict is also
                tolerated, in which case its ``'content'`` entry is used.
            preserve_classes: Whether to preserve HTML class attributes

        Returns:
            String containing the WordPress block representation. If the
            conversion fails as a whole, the input is returned with ``<``
            and ``>`` escaped as HTML entities instead.
        """
        try:
            # Handle input that might be a dictionary
            if isinstance(html_content, dict):
                html_content = html_content.get('content', '')

            # Ensure html_content is a string
            html_content = str(html_content) if html_content is not None else ""

            # Remove the DOCTYPE, everything from <html> through the opening
            # <body> tag (i.e. the whole <head>), and the closing
            # </body>...</html> tail, leaving only the body's inner markup.
            html_content = re.sub(
                r'<!DOCTYPE[^>]*>', '', html_content, flags=re.IGNORECASE)
            html_content = re.sub(
                r'<html[^>]*>.*?<body[^>]*>', '', html_content,
                flags=re.DOTALL | re.IGNORECASE)
            html_content = re.sub(
                r'</body>.*?</html>', '', html_content,
                flags=re.DOTALL | re.IGNORECASE)

            # The markup is already a str, so no ``from_encoding`` is passed:
            # BeautifulSoup ignores it for Unicode input and emits a warning.
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove style tags and their content
            for style in soup.find_all('style'):
                style.decompose()

            # Remove container divs but keep their content
            for div in soup.find_all('div', class_='container'):
                div.unwrap()

            # Remove the first h1 tag as it's used as the post title
            first_h1 = soup.find('h1')
            if first_h1:
                first_h1.decompose()

            blocks = []
            found_first_paragraph = False

            # Process each top-level element. find_all() yields tags only, so
            # bare text between top-level elements is not carried over.
            for element in soup.find_all(recursive=False):
                try:
                    block = self._element_to_block(element, preserve_classes)
                    blocks.append(block)

                    # Insert "more" block after first paragraph
                    if not found_first_paragraph and element.name == 'p':
                        found_first_paragraph = True
                        blocks.append(
                            '<!-- wp:more -->\n<!--more-->\n<!-- /wp:more -->')
                except Exception as e:
                    print(
                        f"Warning: Failed to process element {element.name}: {str(e)}")
                    # Fallback to string representation
                    blocks.append(str(element))

            return '\n\n'.join(blocks)

        except Exception as e:
            print(f"Error converting HTML to blocks: {str(e)}")
            # Return sanitized original content as fallback
            if isinstance(html_content, dict):
                html_content = html_content.get('content', '')
            fallback = str(html_content) if html_content is not None else ""
            return fallback.replace('<', '&lt;').replace('>', '&gt;')