Spaces:
Sleeping
Sleeping
File size: 7,064 Bytes
from bs4 import BeautifulSoup
import json
from typing import Dict, List, Optional
from smolagents.tools import Tool
import re
class HTMLToWPBlocksTool(Tool):
    """Tool that converts an HTML fragment into WordPress Gutenberg block markup.

    Each top-level element whose tag appears in ``BLOCK_MAPPINGS`` is wrapped
    in ``<!-- wp:NAME {attrs} -->`` / ``<!-- /wp:NAME -->`` delimiters; any
    other top-level element is passed through as its original HTML. The first
    ``<h1>`` is dropped and a ``wp:more`` block is inserted after the first
    top-level paragraph.
    """

    name = "html_to_wp_blocks"
    description = "Transforms HTML content into WordPress Gutenberg blocks"
    inputs = {
        'html_content': {'type': 'string', 'description': 'The HTML content to transform'},
        'preserve_classes': {
            'type': 'boolean',
            'description': 'Whether to preserve HTML class attributes as block attributes',
            'nullable': True
        }
    }
    output_type = "string"

    # Mapping of HTML elements to WordPress block names
    BLOCK_MAPPINGS = {
        'p': 'core/paragraph',
        'h1': 'core/heading',
        'h2': 'core/heading',
        'h3': 'core/heading',
        'h4': 'core/heading',
        'h5': 'core/heading',
        'h6': 'core/heading',
        'ul': 'core/list',
        'ol': 'core/list',
        'li': 'core/list-item',
        'img': 'core/image',
        'figure': 'core/image',
        'blockquote': 'core/quote',
        'pre': 'core/code',
        'code': 'core/code',
        'table': 'core/table',
    }

    def __init__(self):
        super().__init__()

    def _serialize_attributes(self, attrs: Dict) -> str:
        """Serialize block attributes as JSON that is safe inside an HTML comment.

        Args:
            attrs: Block attributes to place in the opening block delimiter.

        Returns:
            The JSON text with ``--``, ``<``, ``>`` and ``&`` rewritten as
            ``\\uXXXX`` escapes.
        """
        serialized = json.dumps(attrs)
        # In JSON these sequences can only occur inside string values, so
        # swapping them for unicode escapes keeps the decoded data identical
        # while preventing a value such as "-->" from closing the delimiter.
        for raw, escaped in (
            ('--', '\\u002d\\u002d'),
            ('<', '\\u003c'),
            ('>', '\\u003e'),
            ('&', '\\u0026'),
        ):
            serialized = serialized.replace(raw, escaped)
        return serialized

    def _get_block_attributes(self, element) -> Dict:
        """Extract block attributes from an HTML element.

        Args:
            element: A parsed tag exposing ``name``, ``attrs`` and ``get``.

        Returns:
            A dict that may contain ``level`` (headings), ``align`` (from an
            ``align*`` class) and ``url`` / ``alt`` (``<img>`` elements).
        """
        attrs = {}
        tag = element.name or ''

        # Handle heading levels. fullmatch avoids indexing past the end of a
        # one-character tag name such as "h".
        if re.fullmatch(r'h[1-6]', tag):
            attrs['level'] = int(tag[1])

        # Handle alignment. There is deliberately no break: when several
        # alignment classes are present, the last one in this list wins.
        if 'class' in element.attrs:
            classes = element['class']
            for align in ('alignleft', 'alignright', 'aligncenter',
                          'alignwide', 'alignfull'):
                if align in classes:
                    attrs['align'] = align.replace('align', '')

        # Handle images
        if tag == 'img':
            attrs['url'] = element.get('src', '')
            if element.get('alt'):
                attrs['alt'] = element['alt']

        return attrs

    def _element_to_block(self, element, preserve_classes: bool = False) -> str:
        """Convert a single HTML element to a WordPress block.

        Args:
            element: The parsed tag to convert.
            preserve_classes: When truthy, copy the element's ``class``
                attribute into the block's ``className`` attribute.

        Returns:
            The block markup, or ``str(element)`` unchanged when the tag has
            no entry in ``BLOCK_MAPPINGS``.
        """
        tag = element.name
        if tag not in self.BLOCK_MAPPINGS:
            return str(element)

        block_slug = self.BLOCK_MAPPINGS[tag].replace("core/", "")
        attrs = self._get_block_attributes(element)
        if preserve_classes and 'class' in element.attrs:
            attrs['className'] = ' '.join(element['class'])

        # Children are kept as raw HTML; nested elements are not converted
        # into blocks of their own.
        inner_content = element.decode_contents().strip() if element.contents else ""

        # Opening block delimiter, with attributes only when there are any.
        block_start = f'<!-- wp:{block_slug}'
        if attrs:
            block_start += f' {self._serialize_attributes(attrs)}'
        block_start += ' -->'

        # Wrap content in the appropriate HTML tag
        if tag == 'p':
            content = f'<p>{inner_content}</p>'
        elif 'level' in attrs:
            content = f'<{tag} class="wp-block-heading">{inner_content}</{tag}>'
        elif tag == 'img':
            content = str(element)  # Keep original img tag
        elif tag in ('ul', 'ol', 'li', 'pre', 'code'):
            content = f'<{tag}>{inner_content}</{tag}>'
        elif tag == 'blockquote':
            content = f'<blockquote class="wp-block-quote">{inner_content}</blockquote>'
        elif tag == 'table':
            # Keep the <table> element itself; emitting only the inner rows
            # would leave orphaned <tr>/<td> markup in the block.
            content = (
                f'<figure class="wp-block-table"><table>{inner_content}'
                '</table></figure>'
            )
        elif tag == 'figure':
            # Keep the <figure> wrapper around its contents.
            content = f'<figure class="wp-block-image">{inner_content}</figure>'
        else:
            content = inner_content

        block_end = f'<!-- /wp:{block_slug} -->'
        return f'{block_start}\n{content}\n{block_end}'

    def forward(self, html_content: str, preserve_classes: bool = False) -> str:
        """Transform HTML content into WordPress blocks.

        Args:
            html_content: The HTML content to transform. A dict is also
                accepted, in which case its ``'content'`` entry is used;
                ``None`` is treated as an empty string.
            preserve_classes: Whether to preserve HTML class attributes.

        Returns:
            String containing the WordPress block representation, with blocks
            separated by blank lines. If conversion fails, the input text is
            returned with ``<`` and ``>`` escaped as HTML entities.
        """
        try:
            # Handle input that might be a dictionary
            if isinstance(html_content, dict):
                html_content = html_content.get('content', '')
            # Ensure html_content is a string
            html_content = str(html_content) if html_content is not None else ""

            # Strip the document wrapper (DOCTYPE, <html>..<body>, and
            # </body>..</html>) so only the body content is parsed.
            # Case-insensitive because HTML tag names are.
            html_content = re.sub(
                r'<!DOCTYPE[^>]*>', '', html_content, flags=re.IGNORECASE)
            html_content = re.sub(
                r'<html[^>]*>.*?<body[^>]*>', '', html_content,
                flags=re.DOTALL | re.IGNORECASE)
            html_content = re.sub(
                r'</body>.*?</html>', '', html_content,
                flags=re.DOTALL | re.IGNORECASE)

            # The markup is already a str, so no from_encoding is passed:
            # Beautiful Soup ignores it for Unicode input and emits a warning.
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove style tags and their content
            for style in soup.find_all('style'):
                style.decompose()

            # Remove container divs but keep their content
            for div in soup.find_all('div', class_='container'):
                div.unwrap()

            # Remove the first h1 tag as it's used as the post title
            first_h1 = soup.find('h1')
            if first_h1:
                first_h1.decompose()

            blocks = []
            found_first_paragraph = False

            # Process each top-level element. find_all() yields only tags, so
            # bare top-level text nodes are not emitted.
            for element in soup.find_all(recursive=False):
                try:
                    blocks.append(
                        self._element_to_block(element, preserve_classes))
                    # Insert "more" block after first paragraph
                    if not found_first_paragraph and element.name == 'p':
                        found_first_paragraph = True
                        blocks.append(
                            '<!-- wp:more -->\n<!--more-->\n<!-- /wp:more -->')
                except Exception as e:
                    # Best effort: one bad element must not abort the whole
                    # conversion.
                    print(
                        f"Warning: Failed to process element {element.name}: {str(e)}")
                    # Fallback to string representation
                    blocks.append(str(element))

            return '\n\n'.join(blocks)
        except Exception as e:
            print(f"Error converting HTML to blocks: {str(e)}")
            # Return sanitized original content as fallback
            if isinstance(html_content, dict):
                html_content = html_content.get('content', '')
            fallback = "" if html_content is None else str(html_content)
            # Escape angle brackets so the fallback is shown as text rather
            # than interpreted as markup.
            return fallback.replace('<', '&lt;').replace('>', '&gt;')
|