File size: 7,064 Bytes
0c3a95b
 
 
 
b2c76c3
0c3a95b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d63fdd5
0c3a95b
 
 
 
d63fdd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c3a95b
 
 
 
 
 
 
 
 
 
 
b2c76c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc81f69
 
 
 
 
b2c76c3
cfebff8
b2c76c3
 
 
 
 
 
 
 
cfebff8
 
 
 
 
 
 
b2c76c3
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
from bs4 import BeautifulSoup
import json
from typing import Dict, List, Optional
from smolagents.tools import Tool
import re


class HTMLToWPBlocksTool(Tool):
    """Tool that converts an HTML fragment into WordPress Gutenberg block markup.

    Each top-level element whose tag appears in ``BLOCK_MAPPINGS`` is wrapped
    in ``<!-- wp:NAME {attrs} -->`` / ``<!-- /wp:NAME -->`` delimiters; any
    other top-level element is emitted unchanged. The first ``<h1>`` is
    removed (it is treated as the post title) and a ``more`` block is
    inserted after the first top-level paragraph.
    """

    name = "html_to_wp_blocks"
    description = "Transforms HTML content into WordPress Gutenberg blocks"
    inputs = {
        'html_content': {'type': 'string', 'description': 'The HTML content to transform'},
        'preserve_classes': {
            'type': 'boolean',
            'description': 'Whether to preserve HTML class attributes as block attributes',
            'nullable': True
        }
    }
    output_type = "string"

    # Mapping of HTML elements to WordPress block names
    BLOCK_MAPPINGS = {
        'p': 'core/paragraph',
        'h1': 'core/heading',
        'h2': 'core/heading',
        'h3': 'core/heading',
        'h4': 'core/heading',
        'h5': 'core/heading',
        'h6': 'core/heading',
        'ul': 'core/list',
        'ol': 'core/list',
        'li': 'core/list-item',
        'img': 'core/image',
        'figure': 'core/image',
        'blockquote': 'core/quote',
        'pre': 'core/code',
        'code': 'core/code',
        'table': 'core/table',
    }

    # Tags rendered as core/heading; the digit in the tag name is the level.
    HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # CSS classes that translate to the block "align" attribute.
    ALIGNMENT_CLASSES = ('alignleft', 'alignright',
                         'aligncenter', 'alignwide', 'alignfull')

    def __init__(self):
        super().__init__()

    def _class_names(self, element) -> List[str]:
        """Return the element's CSS classes as a list (empty when absent)."""
        classes = element.get('class') or []
        # BeautifulSoup normally yields a list for "class", but a parser
        # configured without multi-valued attributes yields a plain string.
        if isinstance(classes, str):
            classes = classes.split()
        return list(classes)

    def _serialize_attributes(self, attrs: Dict) -> str:
        """Serialize block attributes as JSON safe to embed in an HTML comment.

        The JSON sits inside ``<!-- ... -->``, so ``--``, ``<``, ``>`` and
        ``&`` are written as JSON unicode escapes; otherwise an attribute
        value such as an image ``alt`` text could terminate the comment early.
        """
        serialized = json.dumps(attrs)
        replacements = (
            ('--', '\\u002d\\u002d'),
            ('<', '\\u003c'),
            ('>', '\\u003e'),
            ('&', '\\u0026'),
        )
        for raw, escaped in replacements:
            serialized = serialized.replace(raw, escaped)
        return serialized

    def _get_block_attributes(self, element) -> Dict:
        """Extract relevant attributes from HTML element for block attributes.

        Args:
            element: A BeautifulSoup tag.

        Returns:
            Dict with any of ``level`` (headings), ``ordered`` (``<ol>``),
            ``align`` (from ``align*`` classes) and ``url``/``alt`` (images).
        """
        attrs = {}

        # Handle heading levels
        if element.name in self.HEADING_TAGS:
            attrs['level'] = int(element.name[1])

        # core/list renders as <ul> unless flagged as ordered
        if element.name == 'ol':
            attrs['ordered'] = True

        # Handle alignment (when several align classes are present, the one
        # listed last in ALIGNMENT_CLASSES wins)
        classes = self._class_names(element)
        for align in self.ALIGNMENT_CLASSES:
            if align in classes:
                attrs['align'] = align.replace('align', '')

        # Handle images, including an <img> nested inside a <figure>
        image = None
        if element.name == 'img':
            image = element
        elif element.name == 'figure':
            image = element.find('img')
        if image is not None:
            attrs['url'] = image.get('src', '')
            if image.get('alt'):
                attrs['alt'] = image['alt']

        return attrs

    def _element_to_block(self, element, preserve_classes: bool = False) -> str:
        """Convert a single HTML element to a WordPress block.

        Args:
            element: A BeautifulSoup tag.
            preserve_classes: When truthy, copy the element's classes into
                the block's ``className`` attribute.

        Returns:
            The block delimiters wrapped around the element's markup, or the
            element's own HTML when its tag has no entry in ``BLOCK_MAPPINGS``.
        """
        tag = element.name
        if tag not in self.BLOCK_MAPPINGS:
            return str(element)

        block_slug = self.BLOCK_MAPPINGS[tag].replace('core/', '')
        attrs = self._get_block_attributes(element)

        if preserve_classes and 'class' in element.attrs:
            attrs['className'] = ' '.join(self._class_names(element))

        # Handle nested content
        inner_content = element.decode_contents().strip() if element.contents else ""

        # Create block comment wrapper
        block_start = f'<!-- wp:{block_slug}'
        if attrs:
            block_start += f' {self._serialize_attributes(attrs)}'
        block_start += ' -->'

        # Wrap content in appropriate HTML tag
        if tag == 'p':
            content = f'<p>{inner_content}</p>'
        elif tag in self.HEADING_TAGS:
            content = f'<{tag} class="wp-block-heading">{inner_content}</{tag}>'
        elif tag == 'img':
            # The image block serializes as a <figure> around the <img>.
            content = f'<figure class="wp-block-image">{element}</figure>'
        elif tag == 'figure':
            # Keep the wrapper; emitting only the children would drop it.
            content = f'<figure class="wp-block-image">{inner_content}</figure>'
        elif tag in ('ul', 'ol'):
            content = f'<{tag}>{inner_content}</{tag}>'
        elif tag == 'li':
            content = f'<li>{inner_content}</li>'
        elif tag == 'blockquote':
            content = f'<blockquote class="wp-block-quote">{inner_content}</blockquote>'
        elif tag in ('pre', 'code'):
            content = f'<{tag}>{inner_content}</{tag}>'
        elif tag == 'table':
            # Keep the <table> element itself; rows without it are invalid.
            content = f'<figure class="wp-block-table"><table>{inner_content}</table></figure>'
        else:
            # A mapped tag without a dedicated template keeps its own markup
            # so its wrapper element is never silently lost.
            content = str(element)

        block_end = f'<!-- /wp:{block_slug} -->'

        return f'{block_start}\n{content}\n{block_end}'

    def forward(self, html_content: str, preserve_classes: bool = False) -> str:
        """Transform HTML content into WordPress blocks

        Args:
            html_content: The HTML content to transform
            preserve_classes: Whether to preserve HTML class attributes

        Returns:
            String containing the WordPress block representation
        """
        try:
            # Handle input that might be a dictionary
            if isinstance(html_content, dict):
                html_content = html_content.get('content', '')

            # Ensure html_content is a string
            html_content = str(
                html_content) if html_content is not None else ""

            # Remove DOCTYPE, html, head, body tags and their content.
            # Tag names are case-insensitive in HTML ("<!doctype html>",
            # "<HTML>"), so the patterns must be too; a surviving <html>
            # wrapper would otherwise be the only top-level element.
            html_content = re.sub(
                r'<!DOCTYPE[^>]*>', '', html_content, flags=re.IGNORECASE)
            html_content = re.sub(
                r'<html[^>]*>.*?<body[^>]*>', '', html_content,
                flags=re.DOTALL | re.IGNORECASE)
            html_content = re.sub(
                r'</body>.*?</html>', '', html_content,
                flags=re.DOTALL | re.IGNORECASE)

            # The markup is already a str, so no from_encoding is passed:
            # BeautifulSoup ignores it for str input and emits a warning.
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove style tags and their content
            for style in soup.find_all('style'):
                style.decompose()

            # Remove container divs but keep their content
            for div in soup.find_all('div', class_='container'):
                div.unwrap()

            # Remove the first h1 tag as it's used as the post title
            first_h1 = soup.find('h1')
            if first_h1:
                first_h1.decompose()

            blocks = []
            found_first_paragraph = False

            # Process each top-level element (find_all yields tags only, so
            # top-level text nodes are not part of the output)
            for element in soup.find_all(recursive=False):
                if not element.name:
                    continue
                try:
                    blocks.append(
                        self._element_to_block(element, preserve_classes))

                    # Insert "more" block after first paragraph
                    if not found_first_paragraph and element.name == 'p':
                        found_first_paragraph = True
                        blocks.append(
                            '<!-- wp:more -->\n<!--more-->\n<!-- /wp:more -->')

                except Exception as e:
                    print(
                        f"Warning: Failed to process element {element.name}: {str(e)}")
                    # Fallback to string representation
                    blocks.append(str(element))

            return '\n\n'.join(blocks)
        except Exception as e:
            print(f"Error converting HTML to blocks: {str(e)}")
            # Return sanitized original content as fallback
            if isinstance(html_content, dict):
                html_content = html_content.get('content', '')
            fallback = "" if html_content is None else str(html_content)
            # Escape '&' first so existing entities are shown literally
            # instead of being decoded when the fallback is rendered.
            return (fallback.replace('&', '&amp;')
                    .replace('<', '&lt;')
                    .replace('>', '&gt;'))