# First_agent_template
#
# Sleeping
#
# File size: 7,064 Bytes

from bs4 import BeautifulSoup
import json
from typing import Dict, List, Optional
from smolagents.tools import Tool
import re


class HTMLToWPBlocksTool(Tool):
    """Convert an HTML fragment into serialized WordPress (Gutenberg) blocks.

    Each top-level element with an entry in ``BLOCK_MAPPINGS`` is wrapped in
    the ``<!-- wp:name {attrs} -->`` / ``<!-- /wp:name -->`` comment
    delimiters of the matching core block; any other element is passed
    through as raw HTML.
    """

    name = "html_to_wp_blocks"
    description = "Transforms HTML content into WordPress Gutenberg blocks"
    inputs = {
        'html_content': {'type': 'string', 'description': 'The HTML content to transform'},
        'preserve_classes': {
            'type': 'boolean',
            'description': 'Whether to preserve HTML class attributes as block attributes',
            'nullable': True
        }
    }
    output_type = "string"

    # Mapping of HTML elements to WordPress block names
    BLOCK_MAPPINGS = {
        'p': 'core/paragraph',
        'h1': 'core/heading',
        'h2': 'core/heading',
        'h3': 'core/heading',
        'h4': 'core/heading',
        'h5': 'core/heading',
        'h6': 'core/heading',
        'ul': 'core/list',
        'ol': 'core/list',
        'li': 'core/list-item',
        'img': 'core/image',
        'figure': 'core/image',
        'blockquote': 'core/quote',
        'pre': 'core/code',
        'code': 'core/code',
        'table': 'core/table',
    }

    def __init__(self):
        super().__init__()

    def _class_list(self, element) -> list:
        """Return the element's CSS classes as a list of strings.

        BeautifulSoup normally yields a list for ``class``; a plain string is
        also accepted and split on whitespace so membership tests and joins
        behave the same either way.
        """
        classes = element.get('class') or []
        if isinstance(classes, str):
            classes = classes.split()
        return list(classes)

    def _get_block_attributes(self, element) -> Dict:
        """Extract the block attributes implied by an HTML element.

        Args:
            element: A BeautifulSoup tag.

        Returns:
            A dict that may contain ``level`` (headings), ``ordered``
            (``<ol>``), ``align`` (from an ``align*`` class) and
            ``url``/``alt`` (``<img>``, or the first ``<img>`` in a
            ``<figure>``).
        """
        attrs = {}
        tag = element.name

        # Heading level: only exact "h<digit>" names, so a one-character
        # tag name can never trigger an IndexError.
        if len(tag) == 2 and tag[0] == 'h' and tag[1].isdigit():
            attrs['level'] = int(tag[1])

        # Without this flag an <ol> is serialized as an unordered list block.
        if tag == 'ol':
            attrs['ordered'] = True

        # Alignment is derived from WordPress "align*" classes; when several
        # are present the last one in this list wins.
        classes = self._class_list(element)
        alignments = ['alignleft', 'alignright',
                      'aligncenter', 'alignwide', 'alignfull']
        for align in alignments:
            if align in classes:
                attrs['align'] = align[len('align'):]

        # Image data comes from the <img> itself, or from the first <img>
        # nested in a <figure>.
        image = None
        if tag == 'img':
            image = element
        elif tag == 'figure':
            image = element.find('img')
        if image is not None:
            attrs['url'] = image.get('src', '')
            if image.get('alt'):
                attrs['alt'] = image['alt']

        return attrs

    def _serialize_attributes(self, attrs: Dict) -> str:
        """Serialize block attributes as JSON that is safe inside a comment.

        The JSON lives inside ``<!-- ... -->``, so ``--``, ``<``, ``>`` and
        ``&`` are rewritten as ``\\uXXXX`` escapes. These characters can only
        occur inside JSON strings, where the escapes decode back to the same
        text, so the attribute values are unchanged after parsing.
        """
        serialized = json.dumps(attrs, separators=(',', ':'))
        replacements = (
            ('--', '\\u002d\\u002d'),
            ('<', '\\u003c'),
            ('>', '\\u003e'),
            ('&', '\\u0026'),
        )
        for raw, escaped in replacements:
            serialized = serialized.replace(raw, escaped)
        return serialized

    def _render_code(self, element) -> str:
        """Render a ``<pre>`` or ``<code>`` element as code-block markup.

        A ``<pre>`` whose only content is a single ``<code>`` child is
        unwrapped first so the output never nests ``<code>`` twice. The code
        text is not stripped because leading whitespace is significant.
        """
        source = element
        if element.name == 'pre':
            child_tags = element.find_all(recursive=False)
            has_loose_text = any(
                isinstance(child, str) and child.strip()
                for child in element.contents
            )
            if (len(child_tags) == 1 and child_tags[0].name == 'code'
                    and not has_loose_text):
                source = child_tags[0]
        return f'<pre class="wp-block-code"><code>{source.decode_contents()}</code></pre>'

    def _element_to_block(self, element, preserve_classes: bool = False) -> str:
        """Convert a single HTML element to a WordPress block.

        Args:
            element: A BeautifulSoup tag.
            preserve_classes: When true, the element's classes are stored in
                the block's ``className`` attribute.

        Returns:
            The serialized block, or the element's own HTML when it has no
            block mapping (or is a ``<figure>`` that contains no ``<img>``).
        """
        tag = element.name
        if tag not in self.BLOCK_MAPPINGS:
            return str(element)
        # A figure without an image cannot be an image block; keep it as-is.
        if tag == 'figure' and element.find('img') is None:
            return str(element)

        slug = self.BLOCK_MAPPINGS[tag].replace('core/', '')
        attrs = self._get_block_attributes(element)

        if preserve_classes:
            classes = self._class_list(element)
            if classes:
                attrs['className'] = ' '.join(classes)

        # Opening comment delimiter, with attributes only when there are any
        block_start = f'<!-- wp:{slug}'
        if attrs:
            block_start += f' {self._serialize_attributes(attrs)}'
        block_start += ' -->'

        # Inner HTML in the shape the corresponding core block saves
        if tag in ('pre', 'code'):
            content = self._render_code(element)
        else:
            inner_content = element.decode_contents().strip()
            if tag == 'p':
                content = f'<p>{inner_content}</p>'
            elif 'level' in attrs:
                content = f'<{tag} class="wp-block-heading">{inner_content}</{tag}>'
            elif tag == 'img':
                content = f'<figure class="wp-block-image">{str(element)}</figure>'
            elif tag == 'figure':
                content = f'<figure class="wp-block-image">{inner_content}</figure>'
            elif tag in ('ul', 'ol'):
                content = f'<{tag}>{inner_content}</{tag}>'
            elif tag == 'li':
                content = f'<li>{inner_content}</li>'
            elif tag == 'blockquote':
                content = f'<blockquote class="wp-block-quote">{inner_content}</blockquote>'
            elif tag == 'table':
                # Keep the <table> tag itself; emitting only its children
                # would leave bare rows with no table around them.
                content = f'<figure class="wp-block-table"><table>{inner_content}</table></figure>'
            else:
                content = inner_content

        block_end = f'<!-- /wp:{slug} -->'

        return f'{block_start}\n{content}\n{block_end}'

    def forward(self, html_content: str, preserve_classes: bool = False) -> str:
        """Transform HTML content into WordPress blocks

        The document wrapper (DOCTYPE, ``<html>``/``<head>``/``<body>``),
        ``<style>`` tags and ``div.container`` wrappers are removed, the first
        ``<h1>`` is dropped (it is used as the post title), and a "more" block
        is inserted after the first top-level paragraph. Top-level text that
        is not inside an element is not emitted.

        Args:
            html_content: The HTML content to transform. A dict is also
                accepted, in which case its ``'content'`` entry is used.
            preserve_classes: Whether to preserve HTML class attributes

        Returns:
            String containing the WordPress block representation. If the
            conversion fails, the input is returned with ``<`` and ``>``
            escaped instead.
        """
        try:
            # Handle input that might be a dictionary
            if isinstance(html_content, dict):
                html_content = html_content.get('content', '')

            # Ensure html_content is a string
            html_content = str(
                html_content) if html_content is not None else ""

            # Remove DOCTYPE, html, head, body tags and their content.
            # HTML tag names are case-insensitive, so match either case.
            html_content = re.sub(
                r'<!DOCTYPE[^>]*>', '', html_content, flags=re.IGNORECASE)
            html_content = re.sub(
                r'<html[^>]*>.*?<body[^>]*>', '', html_content,
                flags=re.DOTALL | re.IGNORECASE)
            html_content = re.sub(
                r'</body>.*?</html>', '', html_content,
                flags=re.DOTALL | re.IGNORECASE)

            # The markup is already a str, so no source encoding is given
            # (from_encoding is ignored for str input and only warns).
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove style tags and their content
            for style in soup.find_all('style'):
                style.decompose()

            # Remove container divs but keep their content
            for div in soup.find_all('div', class_='container'):
                div.unwrap()

            # Remove the first h1 tag as it's used as the post title
            first_h1 = soup.find('h1')
            if first_h1:
                first_h1.decompose()

            blocks = []
            found_first_paragraph = False

            # Process each top-level element; find_all yields only tags.
            for element in soup.find_all(recursive=False):
                try:
                    blocks.append(
                        self._element_to_block(element, preserve_classes))

                    # Insert "more" block after first paragraph
                    if not found_first_paragraph and element.name == 'p':
                        found_first_paragraph = True
                        blocks.append(
                            '<!-- wp:more -->\n<!--more-->\n<!-- /wp:more -->')

                except Exception as e:
                    # Best effort: one bad element must not abort the rest.
                    print(
                        f"Warning: Failed to process element {element.name}: {str(e)}")
                    # Fallback to string representation
                    blocks.append(str(element))

            return '\n\n'.join(blocks)
        except Exception as e:
            print(f"Error converting HTML to blocks: {str(e)}")
            # Return sanitized original content as fallback
            if isinstance(html_content, dict):
                html_content = html_content.get('content', '')
            fallback = "" if html_content is None else str(html_content)
            return fallback.replace('<', '&lt;').replace('>', '&gt;')