# Source: First_agent_template/tools/html_to_wp_blocks.py
# Last commit cfebff8 (juanmaguitar): "Added more block after first paragraph"
from bs4 import BeautifulSoup
import json
from typing import Dict, List, Optional
from smolagents.tools import Tool
import re
class HTMLToWPBlocksTool(Tool):
    """Tool that converts an HTML document or fragment into Gutenberg block markup.

    Each top-level HTML element listed in ``BLOCK_MAPPINGS`` is wrapped in the
    matching ``<!-- wp:... -->`` / ``<!-- /wp:... -->`` comment delimiters.
    Top-level elements that are not in the mapping are emitted unchanged as
    raw HTML. Bare text that sits at the top level (outside any element) is
    not emitted, because only tags are iterated.
    """

    name = "html_to_wp_blocks"
    description = "Transforms HTML content into WordPress Gutenberg blocks"
    inputs = {
        'html_content': {'type': 'string', 'description': 'The HTML content to transform'},
        'preserve_classes': {
            'type': 'boolean',
            'description': 'Whether to preserve HTML class attributes as block attributes',
            'nullable': True
        }
    }
    output_type = "string"

    # Mapping of HTML elements to WordPress block names
    BLOCK_MAPPINGS = {
        'p': 'core/paragraph',
        'h1': 'core/heading',
        'h2': 'core/heading',
        'h3': 'core/heading',
        'h4': 'core/heading',
        'h5': 'core/heading',
        'h6': 'core/heading',
        'ul': 'core/list',
        'ol': 'core/list',
        'li': 'core/list-item',
        'img': 'core/image',
        'figure': 'core/image',
        'blockquote': 'core/quote',
        'pre': 'core/code',
        'code': 'core/code',
        'table': 'core/table',
    }

    def __init__(self):
        super().__init__()

    def _class_list(self, element) -> List[str]:
        """Return the element's CSS classes as a list (empty when absent)."""
        classes = element.get('class') or []
        # html.parser yields a list for ``class``; tolerate a plain string too.
        if isinstance(classes, str):
            classes = classes.split()
        return list(classes)

    def _serialize_attributes(self, attrs: Dict) -> str:
        """Serialize block attributes as JSON that is safe inside an HTML comment.

        The attribute JSON lives inside ``<!-- wp:name {...} -->``, so a value
        containing ``-->`` (for example an ``alt`` text) would terminate the
        comment early. The sequences below are replaced with JSON unicode
        escapes, which decode back to the same characters when parsed.
        """
        serialized = json.dumps(attrs)
        for raw, escaped in (
            ('--', '\\u002d\\u002d'),
            ('<', '\\u003c'),
            ('>', '\\u003e'),
            ('&', '\\u0026'),
            ('\\"', '\\u0022'),
        ):
            serialized = serialized.replace(raw, escaped)
        return serialized

    def _get_block_attributes(self, element) -> Dict:
        """Extract block attributes from an HTML element.

        Args:
            element: A parsed tag (anything exposing ``name`` and ``get``).

        Returns:
            Dict that may contain ``level`` (h1-h6), ``ordered`` (``ol``),
            ``align`` (from an ``align*`` class) and ``url``/``alt`` (from an
            ``img``, or from the first ``img`` inside a ``figure``).
        """
        attrs = {}
        tag = element.name or ''

        # Handle heading levels; fullmatch avoids indexing into names such
        # as a one-character "h" tag.
        if re.fullmatch(r'h[1-6]', tag):
            attrs['level'] = int(tag[1])

        # An ordered list must be flagged, otherwise the list block is
        # interpreted as unordered.
        if tag == 'ol':
            attrs['ordered'] = True

        # Handle alignment; when several align* classes are present the last
        # one in this list wins.
        classes = self._class_list(element)
        for align in ('alignleft', 'alignright', 'aligncenter',
                      'alignwide', 'alignfull'):
            if align in classes:
                attrs['align'] = align.replace('align', '')

        # Handle images, including an <img> nested in a <figure>.
        image = None
        if tag == 'img':
            image = element
        elif tag == 'figure':
            image = element.find('img')
        if image is not None:
            attrs['url'] = image.get('src', '')
            if image.get('alt'):
                attrs['alt'] = image['alt']

        return attrs

    def _element_to_block(self, element, preserve_classes: bool = False) -> str:
        """Convert a single HTML element to a WordPress block.

        Args:
            element: The parsed tag to convert.
            preserve_classes: When true, copy the element's classes into the
                block's ``className`` attribute.

        Returns:
            The block markup (opening comment, HTML content, closing comment),
            or the element's own HTML when it has no block mapping or is a
            ``figure`` that contains no ``img``.
        """
        tag = element.name
        if tag not in self.BLOCK_MAPPINGS:
            return str(element)
        # A <figure> maps to the image block; without an image inside there
        # is nothing sensible to emit as an image, so keep the raw HTML.
        if tag == 'figure' and element.find('img') is None:
            return str(element)

        block_slug = self.BLOCK_MAPPINGS[tag].replace('core/', '')
        attrs = self._get_block_attributes(element)

        if preserve_classes:
            classes = self._class_list(element)
            if classes:
                attrs['className'] = ' '.join(classes)

        # Handle nested content
        inner_content = element.decode_contents().strip()

        # Create block comment wrapper
        block_start = f'<!-- wp:{block_slug}'
        if attrs:
            block_start += f' {self._serialize_attributes(attrs)}'
        block_start += ' -->'

        # Wrap content in appropriate HTML tag
        if tag == 'p':
            content = f'<p>{inner_content}</p>'
        elif re.fullmatch(r'h[1-6]', tag):
            content = f'<{tag} class="wp-block-heading">{inner_content}</{tag}>'
        elif tag == 'img':
            content = f'<figure class="wp-block-image">{element}</figure>'
        elif tag == 'figure':
            content = f'<figure class="wp-block-image">{inner_content}</figure>'
        elif tag in ('ul', 'ol'):
            content = f'<{tag}>{inner_content}</{tag}>'
        elif tag == 'li':
            content = f'<li>{inner_content}</li>'
        elif tag == 'blockquote':
            content = f'<blockquote class="wp-block-quote">{inner_content}</blockquote>'
        elif tag == 'pre':
            # Keep an existing direct <code> child; otherwise add one so the
            # markup is always <pre><code>...</code></pre>.
            if element.find('code', recursive=False) is not None:
                content = f'<pre class="wp-block-code">{inner_content}</pre>'
            else:
                content = f'<pre class="wp-block-code"><code>{inner_content}</code></pre>'
        elif tag == 'code':
            content = f'<pre class="wp-block-code"><code>{inner_content}</code></pre>'
        elif tag == 'table':
            # Keep the <table> element itself; emitting only its children
            # would leave orphaned rows inside the block.
            content = f'<figure class="wp-block-table"><table>{inner_content}</table></figure>'
        else:
            content = inner_content

        block_end = f'<!-- /wp:{block_slug} -->'
        return f'{block_start}\n{content}\n{block_end}'

    def forward(self, html_content: str, preserve_classes: bool = False) -> str:
        """Transform HTML content into WordPress blocks

        Args:
            html_content: The HTML content to transform. A dict is also
                tolerated, in which case its ``'content'`` entry is used.
            preserve_classes: Whether to preserve HTML class attributes

        Returns:
            String containing the WordPress block representation, with blocks
            separated by blank lines. A "more" block is inserted after the
            first top-level paragraph. If conversion fails, the input is
            returned with ``<`` and ``>`` replaced by HTML entities.
        """
        try:
            # Handle input that might be a dictionary
            if isinstance(html_content, dict):
                html_content = html_content.get('content', '')

            # Ensure html_content is a string
            html_content = str(html_content) if html_content is not None else ""

            # Remove DOCTYPE and everything from <html> up to <body>, plus the
            # closing tags. Tag names are case-insensitive in HTML.
            html_content = re.sub(
                r'<!DOCTYPE[^>]*>', '', html_content, flags=re.IGNORECASE)
            html_content = re.sub(
                r'<html[^>]*>.*?<body[^>]*>', '', html_content,
                flags=re.DOTALL | re.IGNORECASE)
            html_content = re.sub(
                r'</body>.*?</html>', '', html_content,
                flags=re.DOTALL | re.IGNORECASE)

            # The input is already text, so no from_encoding is passed (it
            # only applies to byte input).
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove head and style tags and their content. Each name is
            # queried separately so an element nested in one that was already
            # removed is never touched again.
            for unwanted in ('head', 'style'):
                for tag in soup.find_all(unwanted):
                    tag.decompose()

            # Drop any document wrappers the regexes above did not catch
            # (e.g. <body> without <html>) but keep their content.
            for wrapper in soup.find_all(['html', 'body']):
                wrapper.unwrap()

            # Remove container divs but keep their content
            for div in soup.find_all('div', class_='container'):
                div.unwrap()

            # Remove the first h1 tag as it's used as the post title
            first_h1 = soup.find('h1')
            if first_h1:
                first_h1.decompose()

            blocks = []
            more_inserted = False

            # Process each top-level element
            for element in soup.find_all(recursive=False):
                try:
                    blocks.append(
                        self._element_to_block(element, bool(preserve_classes)))
                except Exception as e:
                    print(
                        f"Warning: Failed to process element {element.name}: {str(e)}")
                    # Fallback to string representation
                    blocks.append(str(element))
                    continue

                # Insert "more" block after first paragraph
                if not more_inserted and element.name == 'p':
                    more_inserted = True
                    blocks.append(
                        '<!-- wp:more -->\n<!--more-->\n<!-- /wp:more -->')

            return '\n\n'.join(blocks)
        except Exception as e:
            print(f"Error converting HTML to blocks: {str(e)}")
            # Return sanitized original content as fallback
            if isinstance(html_content, dict):
                html_content = html_content.get('content', '')
            return str(html_content).replace('<', '&lt;').replace('>', '&gt;')