Spaces:
Sleeping
Sleeping
| from bs4 import BeautifulSoup | |
| import json | |
| from typing import Dict, List, Optional | |
| from smolagents.tools import Tool | |
| import re | |
class HTMLToWPBlocksTool(Tool):
    """Tool that rewrites an HTML fragment as WordPress Gutenberg block markup.

    Each top-level element whose tag appears in ``BLOCK_MAPPINGS`` is wrapped
    in ``<!-- wp:NAME {attrs} -->`` / ``<!-- /wp:NAME -->`` comment delimiters;
    any other top-level element is emitted unchanged.
    """

    name = "html_to_wp_blocks"
    description = "Transforms HTML content into WordPress Gutenberg blocks"
    inputs = {
        'html_content': {'type': 'string', 'description': 'The HTML content to transform'},
        'preserve_classes': {
            'type': 'boolean',
            'description': 'Whether to preserve HTML class attributes as block attributes',
            'nullable': True
        }
    }
    output_type = "string"

    # Mapping of HTML elements to WordPress block names
    BLOCK_MAPPINGS = {
        'p': 'core/paragraph',
        'h1': 'core/heading',
        'h2': 'core/heading',
        'h3': 'core/heading',
        'h4': 'core/heading',
        'h5': 'core/heading',
        'h6': 'core/heading',
        'ul': 'core/list',
        'ol': 'core/list',
        'li': 'core/list-item',
        'img': 'core/image',
        'figure': 'core/image',
        'blockquote': 'core/quote',
        'pre': 'core/code',
        'code': 'core/code',
        'table': 'core/table',
    }

    def __init__(self):
        super().__init__()

    def _get_block_attributes(self, element) -> Dict:
        """Build the block attribute dict for a parsed HTML element.

        Args:
            element: A parsed tag exposing ``name``, ``attrs``, ``get`` and
                item access (a BeautifulSoup tag in ``forward``).

        Returns:
            Dict that may contain ``level`` (headings), ``align`` (from an
            ``align*`` class) and ``url`` / ``alt`` (``img`` elements).
        """
        attrs = {}
        tag = element.name

        # Heading level: the length check guards the tag[1] index.
        if len(tag) == 2 and tag[0] == 'h' and tag[1].isdigit():
            attrs['level'] = int(tag[1])

        # Alignment classes; when several are present the last one listed
        # here wins because the loop does not stop at the first match.
        if 'class' in element.attrs:
            classes = element['class']
            for align in ('alignleft', 'alignright', 'aligncenter',
                          'alignwide', 'alignfull'):
                if align in classes:
                    attrs['align'] = align.replace('align', '')

        # Image source and (non-empty) alternative text.
        if tag == 'img':
            attrs['url'] = element.get('src', '')
            if element.get('alt'):
                attrs['alt'] = element['alt']

        return attrs

    def _element_to_block(self, element, preserve_classes: bool = False) -> str:
        """Convert a single HTML element to WordPress block markup.

        Args:
            element: The parsed tag to convert.
            preserve_classes: When truthy, the element's classes are copied
                into the block's ``className`` attribute.

        Returns:
            The block as ``<!-- wp:NAME ... -->``, content, and
            ``<!-- /wp:NAME -->`` joined by newlines, or ``str(element)``
            when the tag has no entry in ``BLOCK_MAPPINGS``.
        """
        tag = element.name
        if tag not in self.BLOCK_MAPPINGS:
            return str(element)

        short_name = self.BLOCK_MAPPINGS[tag].replace("core/", "")
        attrs = self._get_block_attributes(element)
        if preserve_classes and 'class' in element.attrs:
            attrs['className'] = ' '.join(element['class'])

        # Serialized children of the element (empty string when it has none).
        inner_content = element.decode_contents().strip() if element.contents else ""

        block_start = f'<!-- wp:{short_name}'
        if attrs:
            # The JSON sits inside an HTML comment, so a value containing
            # "--" or ">" (e.g. alt text) could close the comment early.
            # These \uXXXX forms are valid JSON string escapes and decode
            # back to the same characters.
            serialized = (
                json.dumps(attrs)
                .replace('--', '\\u002d\\u002d')
                .replace('<', '\\u003c')
                .replace('>', '\\u003e')
                .replace('&', '\\u0026')
            )
            block_start += f' {serialized}'
        block_start += ' -->'

        # Wrap content in the appropriate HTML tag
        if tag == 'p':
            content = f'<p>{inner_content}</p>'
        elif 'level' in attrs:
            content = f'<{tag} class="wp-block-heading">{inner_content}</{tag}>'
        elif tag == 'blockquote':
            content = f'<blockquote class="wp-block-quote">{inner_content}</blockquote>'
        elif tag in ('ul', 'ol', 'li', 'pre', 'code'):
            content = f'<{tag}>{inner_content}</{tag}>'
        else:
            # img, figure and table keep their own markup: emitting only the
            # inner content would drop the <table>/<figure> wrapper and leave
            # bare rows or an orphaned <figcaption>.
            content = str(element)

        block_end = f'<!-- /wp:{short_name} -->'
        return f'{block_start}\n{content}\n{block_end}'

    def forward(self, html_content: str, preserve_classes: bool = False) -> str:
        """Transform HTML content into WordPress blocks

        Args:
            html_content: The HTML content to transform. A dict is also
                accepted, in which case its 'content' entry is used.
            preserve_classes: Whether to preserve HTML class attributes

        Returns:
            String containing the WordPress block representation. If the
            conversion fails, the input text is returned with '<' and '>'
            replaced by HTML entities.
        """
        try:
            # Handle input that might be a dictionary
            if isinstance(html_content, dict):
                html_content = html_content.get('content', '')
            # Ensure html_content is a string
            html_content = str(html_content) if html_content is not None else ""

            # Strip the document shell (DOCTYPE, <html>...<body>, and
            # </body>...</html>). Tag names are matched case-insensitively so
            # "<!doctype html>" and "<BODY>" are handled too.
            html_content = re.sub(
                r'<!DOCTYPE[^>]*>', '', html_content, flags=re.IGNORECASE)
            html_content = re.sub(
                r'<html[^>]*>.*?<body[^>]*>', '', html_content,
                flags=re.DOTALL | re.IGNORECASE)
            html_content = re.sub(
                r'</body>.*?</html>', '', html_content,
                flags=re.DOTALL | re.IGNORECASE)

            # The markup is already a str, so no from_encoding is passed
            # (Beautiful Soup ignores it for Unicode input and warns).
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove style tags and their content
            for style in soup.find_all('style'):
                style.decompose()

            # Remove container divs but keep their content
            for div in soup.find_all('div', class_='container'):
                div.unwrap()

            # Remove the first h1 tag as it's used as the post title
            first_h1 = soup.find('h1')
            if first_h1:
                first_h1.decompose()

            blocks = []
            found_first_paragraph = False

            # Process each top-level element; find_all yields tags only, so
            # bare top-level text nodes are not included in the output.
            for element in soup.find_all(recursive=False):
                try:
                    blocks.append(
                        self._element_to_block(element, preserve_classes))
                    # Insert "more" block after first paragraph
                    if not found_first_paragraph and element.name == 'p':
                        found_first_paragraph = True
                        blocks.append(
                            '<!-- wp:more -->\n<!--more-->\n<!-- /wp:more -->')
                except Exception as e:
                    print(
                        f"Warning: Failed to process element {element.name}: {str(e)}")
                    # Fallback to string representation
                    blocks.append(str(element))

            return '\n\n'.join(blocks)
        except Exception as e:
            print(f"Error converting HTML to blocks: {str(e)}")
            # Return sanitized original content as fallback
            if isinstance(html_content, dict):
                html_content = html_content.get('content', '')
            # Coerce to str so a non-string input cannot raise again here.
            fallback = "" if html_content is None else str(html_content)
            return fallback.replace('<', '&lt;').replace('>', '&gt;')