import logging
from typing import Any, Dict, List, Tuple

from bs4 import BeautifulSoup, NavigableString, Tag
from bs4.element import PreformattedString

logger = logging.getLogger(__name__)


class HTMLProcessor:
    """
    A processor for HTML content that preserves the HTML structure while only
    translating text content.

    Typical flow: ``extract_text`` -> translate the returned fragments ->
    ``replace_text`` with the translated fragments (same order, same length).
    """

    def __init__(self):
        # Elements carrying this CSS class are left untranslated (whole subtree).
        self.skip_translation_class = 'notranslate'
        # Elements whose content is never offered for translation.
        self.skip_tags = {
            'script', 'style', 'pre', 'code', 'head', 'title', 'meta', 'link',
            'iframe', 'noscript', 'svg', 'path', 'img'
        }

    def extract_text(self, html_content: str) -> Tuple[List[str], Dict[str, Any]]:
        """
        Extract translatable text nodes from HTML content while preserving
        the document structure.

        Args:
            html_content: HTML content as a string

        Returns:
            A tuple containing:
            - List of text fragments to translate (whitespace-stripped)
            - DOM data: ``{'soup': <parsed document>, 'node_map': {index: node}}``
              where ``index`` is the fragment's position in the list.
            On failure the error is logged and ``([], {})`` is returned.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            text_fragments: List[str] = []
            dom_map: Dict[int, Any] = {}
            self._extract_text_from_node(soup, text_fragments, dom_map)
            return text_fragments, {'soup': soup, 'node_map': dom_map}
        except Exception as e:
            # Best-effort API: callers get empty results instead of an exception.
            logger.exception("Error extracting text from HTML: %s", e)
            return [], {}

    def _has_skip_class(self, tag) -> bool:
        """Return True when ``tag`` carries the skip-translation CSS class."""
        classes = tag.get('class')
        if not classes:
            return False
        if isinstance(classes, str):
            # Some parsers expose ``class`` as one string; split it so the
            # membership test matches whole class names, not substrings.
            classes = classes.split()
        return self.skip_translation_class in classes

    @staticmethod
    def _split_surrounding_whitespace(text: str) -> Tuple[str, str, str]:
        """Split ``text`` into (leading whitespace, core, trailing whitespace)."""
        core = text.strip()
        if not core:
            # All-whitespace: report it once, as leading, so it is not doubled.
            return text, "", ""
        start = len(text) - len(text.lstrip())
        end = len(text.rstrip())
        return text[:start], core, text[end:]

    def _extract_text_from_node(self, node, text_fragments: List[str],
                                dom_map: Dict[int, Any], path: str = ""):
        """
        Recursively extract text from nodes while maintaining structure.

        Args:
            node: The current BeautifulSoup node
            text_fragments: List to store extracted text (appended in place)
            dom_map: Dictionary mapping fragment indices to text nodes
                (filled in place)
            path: Current path in the DOM tree; carried along for debugging
                only and not used for extraction
        """
        if isinstance(node, Tag):
            if node.name in self.skip_tags or self._has_skip_class(node):
                return
            for child in node.children:
                child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
                self._extract_text_from_node(child, text_fragments, dom_map, child_path)
            return

        if not isinstance(node, NavigableString):
            return
        # Comments, doctypes, CDATA, declarations and processing instructions
        # are NavigableString subclasses; translating them would turn them
        # into plain visible text when they are replaced.
        if isinstance(node, PreformattedString):
            return

        parent = node.parent
        if parent is None or parent.name in self.skip_tags:
            return

        text = str(node).strip()
        if text:
            dom_map[len(text_fragments)] = node
            text_fragments.append(text)

    def replace_text(self, dom_data: Dict[str, Any], translated_fragments: List[str]) -> str:
        """
        Replace the original text with translated text while keeping the
        HTML structure.

        The whitespace surrounding each original text node is re-attached to
        its translation so spacing between inline elements is not lost. The
        node map is updated to point at the new nodes, so the same
        ``dom_data`` can be passed to this method again.

        Args:
            dom_data: DOM data containing soup and node map (as returned by
                ``extract_text``)
            translated_fragments: List of translated text fragments, indexed
                like the fragments returned by ``extract_text``

        Returns:
            HTML content with translated text and preserved structure, or an
            empty string when ``dom_data`` is invalid or replacement fails.
        """
        try:
            soup = dom_data.get('soup')
            node_map = dom_data.get('node_map')

            # An empty node map is valid (document without translatable text);
            # only a missing soup / map means the DOM data is unusable.
            if soup is None or node_map is None:
                logger.error("Invalid DOM data for text replacement")
                return ""

            fragment_count = len(translated_fragments)
            if fragment_count != len(node_map):
                logger.warning(
                    "Expected %d translated fragments, got %d",
                    len(node_map), fragment_count,
                )

            # Snapshot the items because the map is updated inside the loop.
            for index, node in list(node_map.items()):
                if index >= fragment_count:
                    continue  # no translation supplied: keep the original text
                leading, _, trailing = self._split_surrounding_whitespace(str(node))
                replacement = NavigableString(
                    f"{leading}{translated_fragments[index]}{trailing}"
                )
                node.replace_with(replacement)
                node_map[index] = replacement

            return str(soup)
        except Exception as e:
            # Best-effort API: callers get "" instead of an exception.
            logger.exception("Error replacing text in HTML: %s", e)
            return ""

    def prepare_fragments_with_token(self, fragments: List[str], special_token: str) -> List[str]:
        """
        Prepare text fragments by adding special language token to each fragment.

        Args:
            fragments: List of text fragments
            special_token: Special language token to add (e.g., '>>tam<<')

        Returns:
            List of fragments with the token prepended to every non-blank
            fragment. When ``special_token`` is empty the input list itself
            is returned unchanged.
        """
        if not special_token:
            return fragments

        return [
            f"{special_token}{fragment}" if fragment.strip() else fragment
            for fragment in fragments
        ]