| import logging | |
| from typing import Any, Dict, List, Tuple | |
| from bs4 import BeautifulSoup, NavigableString, Tag | |
| logger = logging.getLogger(__name__) | |
class HTMLProcessor:
    """Extract translatable text from HTML and write translations back in place.

    The document is parsed with BeautifulSoup's ``html.parser``. Only plain
    text nodes are exposed for translation; tags, attributes, comments,
    doctype declarations and the whitespace surrounding each text node are
    left as they are in the parsed tree.
    """

    def __init__(self):
        """Set up the default rules for what is excluded from translation."""
        # Elements carrying this CSS class are skipped together with their subtree.
        self.skip_translation_class = 'notranslate'
        # Tags whose subtree is never descended into when collecting text.
        self.skip_tags = {
            'script', 'style', 'pre', 'code', 'head', 'title', 'meta',
            'link', 'iframe', 'noscript', 'svg', 'path', 'img',
        }

    def extract_text(self, html_content: str) -> Tuple[List[str], Dict[str, Any]]:
        """Collect the translatable text fragments of an HTML document.

        Args:
            html_content: HTML content as a string.

        Returns:
            A tuple ``(fragments, dom_data)`` where ``fragments`` is the list of
            whitespace-stripped text fragments in document order and
            ``dom_data`` is ``{'soup': <parsed tree>, 'node_map': {index: node}}``
            mapping each fragment index to the text node it came from.
            On any error the failure is logged and ``([], {})`` is returned.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            text_fragments: List[str] = []
            dom_map: Dict[int, Any] = {}
            self._extract_text_from_node(soup, text_fragments, dom_map)
        except Exception as e:
            # Best-effort boundary: callers get an empty result, never an exception.
            logger.exception("Error extracting text from HTML: %s", e)
            return [], {}
        return text_fragments, {'soup': soup, 'node_map': dom_map}

    def _is_translatable_text(self, node) -> bool:
        """Return True when *node* is a plain text node that may be translated.

        Comments, CDATA sections, doctype declarations and processing
        instructions are ``NavigableString`` subclasses in bs4 (they all derive
        from ``PreformattedString``). They are markup, not visible text, so they
        are rejected here; otherwise they would be translated and later written
        back as ordinary page text.
        """
        # Imported locally so the class does not depend on an extra
        # module-level import being present.
        from bs4.element import PreformattedString

        if not isinstance(node, NavigableString):
            return False
        if isinstance(node, PreformattedString):
            return False
        parent = node.parent
        return parent is not None and parent.name not in self.skip_tags

    def _extract_text_from_node(self, node, text_fragments: List[str], dom_map: Dict[int, Any], path: str = ""):
        """Recursively collect text nodes below *node* in document order.

        Args:
            node: The current BeautifulSoup node (tag or string).
            text_fragments: Output list; stripped text is appended to it.
            dom_map: Output dict; maps the fragment's index in
                ``text_fragments`` to the originating text node.
            path: Slash-separated tag path of *node*, carried for debugging only.
        """
        if isinstance(node, Tag):
            if node.name in self.skip_tags:
                return
            classes = node.get('class')
            if classes and self.skip_translation_class in classes:
                return
            for child in node.children:
                child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
                self._extract_text_from_node(child, text_fragments, dom_map, child_path)
            return

        if self._is_translatable_text(node):
            text = str(node).strip()
            if text:
                # The index of the fragment doubles as the key into the node map.
                dom_map[len(text_fragments)] = node
                text_fragments.append(text)

    @staticmethod
    def _with_original_padding(original: str, translated: str) -> str:
        """Wrap *translated* in the leading/trailing whitespace of *original*.

        Fragments are stripped before translation, so the whitespace that
        separated the text from neighbouring inline elements has to be put back
        when the node is replaced.
        """
        if not original.strip():
            # Nothing but whitespace: there is no text to pad around.
            return translated
        leading = original[:len(original) - len(original.lstrip())]
        trailing = original[len(original.rstrip()):]
        return f"{leading}{translated}{trailing}"

    def replace_text(self, dom_data: Dict[str, Any], translated_fragments: List[str]) -> str:
        """Write translated fragments back into the parsed document.

        Args:
            dom_data: The ``dom_data`` dict produced by ``extract_text``
                (keys ``'soup'`` and ``'node_map'``).
            translated_fragments: Translations, positionally aligned with the
                fragments returned by ``extract_text``. Nodes whose index has
                no corresponding translation are left untouched.

        Returns:
            The serialized HTML with translated text. A document that has no
            translatable text is returned unchanged. If the parsed document is
            missing or an error occurs, the problem is logged and ``""`` is
            returned.
        """
        try:
            soup = dom_data.get('soup')
            if soup is None:
                logger.error("Invalid DOM data for text replacement")
                return ""
            # An empty node map is valid: the document simply had no text.
            node_map = dom_data.get('node_map') or {}
            available = len(translated_fragments)
            # Snapshot the items because the map is updated inside the loop.
            for index, node in list(node_map.items()):
                if index >= available:
                    continue
                replacement = NavigableString(
                    self._with_original_padding(str(node), translated_fragments[index])
                )
                node.replace_with(replacement)
                # Point the map at the live node so dom_data can be reused.
                node_map[index] = replacement
            return str(soup)
        except Exception as e:
            logger.exception("Error replacing text in HTML: %s", e)
            return ""

    def prepare_fragments_with_token(self, fragments: List[str], special_token: str) -> List[str]:
        """Prefix every non-blank fragment with a language token.

        Args:
            fragments: List of text fragments.
            special_token: Token to prepend (e.g. ``'>>tam<<'``). When empty,
                ``fragments`` is returned as-is.

        Returns:
            A new list in which each fragment containing non-whitespace text is
            prefixed with ``special_token``; blank fragments are kept unchanged.
        """
        if not special_token:
            return fragments
        return [
            f"{special_token}{fragment}" if fragment.strip() else fragment
            for fragment in fragments
        ]