First_agent_template

Sleeping

App Files Files Community

First_agent_template / tools /html_to_wp_blocks.py

juanmaguitar

Added more block after first paragraph

cfebff8 12 months ago

raw

history blame contribute delete

7.06 kB

	from bs4 import BeautifulSoup
	import json
	from typing import Dict, List, Optional
	from smolagents.tools import Tool
	import re


	class HTMLToWPBlocksTool(Tool):
	    """Tool that rewrites HTML as WordPress Gutenberg block markup.

	    Each top-level element of the parsed HTML whose tag appears in
	    ``BLOCK_MAPPINGS`` is wrapped in ``<!-- wp:NAME {attrs} -->`` /
	    ``<!-- /wp:NAME -->`` comment delimiters; any other top-level element is
	    emitted unchanged. The first ``<h1>`` is dropped and a ``wp:more`` block
	    is inserted after the first top-level paragraph.
	    """

	    name = "html_to_wp_blocks"
	    description = "Transforms HTML content into WordPress Gutenberg blocks"
	    inputs = {
	        'html_content': {'type': 'string', 'description': 'The HTML content to transform'},
	        'preserve_classes': {
	            'type': 'boolean',
	            'description': 'Whether to preserve HTML class attributes as block attributes',
	            'nullable': True
	        }
	    }
	    output_type = "string"

	    # Mapping of HTML elements to WordPress block names
	    BLOCK_MAPPINGS = {
	        'p': 'core/paragraph',
	        'h1': 'core/heading',
	        'h2': 'core/heading',
	        'h3': 'core/heading',
	        'h4': 'core/heading',
	        'h5': 'core/heading',
	        'h6': 'core/heading',
	        'ul': 'core/list',
	        'ol': 'core/list',
	        'li': 'core/list-item',
	        'img': 'core/image',
	        'figure': 'core/image',
	        'blockquote': 'core/quote',
	        'pre': 'core/code',
	        'code': 'core/code',
	        'table': 'core/table',
	    }

	    def __init__(self):
	        super().__init__()

	    def _get_block_attributes(self, element) -> Dict:
	        """Extract block attributes from an HTML element.

	        Args:
	            element: A parsed tag (BeautifulSoup ``Tag``).

	        Returns:
	            Dict that may contain ``level`` (h1-h6), ``align`` (from an
	            ``align*`` class), and ``url`` / ``alt`` (from an ``<img>``, or
	            from the first ``<img>`` inside a ``<figure>``).
	        """
	        attrs = {}

	        # Match the whole tag name so a one-character name starting with
	        # 'h' can never be indexed out of range.
	        if re.fullmatch(r'h[1-6]', element.name):
	            attrs['level'] = int(element.name[1])

	        # Handle alignment; when several align* classes are present the one
	        # listed last here wins.
	        classes = element.get('class') or []
	        for align in ('alignleft', 'alignright',
	                      'aligncenter', 'alignwide', 'alignfull'):
	            if align in classes:
	                attrs['align'] = align.replace('align', '')

	        # Handle images: a <figure> takes its data from the <img> it wraps.
	        image = None
	        if element.name == 'img':
	            image = element
	        elif element.name == 'figure':
	            image = element.find('img')
	        if image is not None:
	            attrs['url'] = image.get('src', '')
	            if image.get('alt'):
	                attrs['alt'] = image['alt']

	        return attrs

	    def _element_to_block(self, element, preserve_classes: bool = False) -> str:
	        """Convert a single HTML element to a WordPress block.

	        Args:
	            element: A parsed tag (BeautifulSoup ``Tag``).
	            preserve_classes: When truthy, copy the element's ``class``
	                attribute into the block's ``className`` attribute.

	        Returns:
	            The block markup (opening comment, content, closing comment), or
	            the element's own HTML when no block mapping applies.
	        """
	        tag = element.name
	        if tag not in self.BLOCK_MAPPINGS:
	            return str(element)

	        # A <figure> without an <img> is not an image; pass it through as
	        # raw HTML rather than mislabelling it as an image block.
	        if tag == 'figure' and element.find('img') is None:
	            return str(element)

	        block_name = self.BLOCK_MAPPINGS[tag]
	        block_slug = block_name.replace("core/", "")
	        attrs = self._get_block_attributes(element)

	        if preserve_classes and 'class' in element.attrs:
	            attrs['className'] = ' '.join(element['class'])

	        # Handle nested content
	        inner_content = element.decode_contents().strip() if element.contents else ""

	        # Create block comment wrapper
	        block_start = f'<!-- wp:{block_slug}'
	        if attrs:
	            block_start += f' {json.dumps(attrs)}'
	        block_start += ' -->'

	        # Wrap content in appropriate HTML tag
	        if tag == 'p':
	            content = f'<p>{inner_content}</p>'
	        elif block_name == 'core/heading':
	            content = f'<{tag} class="wp-block-heading">{inner_content}</{tag}>'
	        elif tag == 'img':
	            # Same wrapper as the <figure> branch so both image paths agree.
	            content = f'<figure class="wp-block-image">{element}</figure>'
	        elif tag == 'figure':
	            content = f'<figure class="wp-block-image">{inner_content}</figure>'
	        elif tag in ('ul', 'ol'):
	            content = f'<{tag}>{inner_content}</{tag}>'
	        elif tag == 'li':
	            content = f'<li>{inner_content}</li>'
	        elif tag == 'blockquote':
	            content = f'<blockquote class="wp-block-quote">{inner_content}</blockquote>'
	        elif tag in ('pre', 'code'):
	            content = f'<{tag}>{inner_content}</{tag}>'
	        elif tag == 'table':
	            # Keep the <table> tag itself; emitting only its children would
	            # leave orphan rows.
	            content = (
	                f'<figure class="wp-block-table"><table>{inner_content}</table></figure>'
	            )
	        else:
	            # Defensive default for mappings added later: never drop the
	            # element's own tag.
	            content = str(element)

	        block_end = f'<!-- /wp:{block_slug} -->'

	        return f'{block_start}\n{content}\n{block_end}'

	    def forward(self, html_content: str, preserve_classes: bool = False) -> str:
	        """Transform HTML content into WordPress blocks

	        Args:
	            html_content: The HTML content to transform. A dict is also
	                accepted, in which case its ``'content'`` entry is used.
	            preserve_classes: Whether to preserve HTML class attributes

	        Returns:
	            String containing the WordPress block representation, blocks
	            separated by blank lines. If conversion fails, the input is
	            returned with ``<`` and ``>`` escaped as HTML entities.
	        """
	        try:
	            # Handle input that might be a dictionary
	            if isinstance(html_content, dict):
	                html_content = html_content.get('content', '')

	            # Ensure html_content is a string
	            html_content = str(html_content) if html_content is not None else ""

	            # Strip document scaffolding: the doctype, everything from
	            # <html> through the opening <body> tag (this removes <head>),
	            # and the closing </body>...</html>.
	            html_content = re.sub(
	                r'<!DOCTYPE[^>]*>', '', html_content, flags=re.IGNORECASE)
	            html_content = re.sub(
	                r'<html[^>]*>.*?<body[^>]*>', '', html_content,
	                flags=re.DOTALL | re.IGNORECASE)
	            html_content = re.sub(
	                r'</body>.*?</html>', '', html_content,
	                flags=re.DOTALL | re.IGNORECASE)

	            # The markup is already a str, so no from_encoding is passed
	            # (bs4 ignores it for str input and emits a warning).
	            soup = BeautifulSoup(html_content, 'html.parser')

	            # Remove style tags and their content
	            for style in soup.find_all('style'):
	                style.decompose()

	            # Remove container divs but keep their content
	            for div in soup.find_all('div', class_='container'):
	                div.unwrap()

	            # Remove the first h1 tag as it's used as the post title
	            first_h1 = soup.find('h1')
	            if first_h1:
	                first_h1.decompose()

	            blocks = []
	            found_first_paragraph = False

	            # Process each top-level element; find_all yields tags only, so
	            # bare top-level text is not emitted.
	            for element in soup.find_all(recursive=False):
	                try:
	                    blocks.append(
	                        self._element_to_block(element, preserve_classes))

	                    # Insert "more" block after first paragraph
	                    if not found_first_paragraph and element.name == 'p':
	                        found_first_paragraph = True
	                        blocks.append(
	                            '<!-- wp:more -->\n<!--more-->\n<!-- /wp:more -->')

	                except Exception as e:
	                    print(
	                        f"Warning: Failed to process element {element.name}: {str(e)}")
	                    # Fallback to string representation
	                    blocks.append(str(element))

	            return '\n\n'.join(blocks)
	        except Exception as e:
	            print(f"Error converting HTML to blocks: {str(e)}")
	            # Return sanitized original content as fallback
	            if isinstance(html_content, dict):
	                html_content = html_content.get('content', '')
	            fallback = str(html_content) if html_content is not None else ""
	            # Escape angle brackets so the fallback shows as text, not markup.
	            return fallback.replace('<', '&lt;').replace('>', '&gt;')