File size: 4,640 Bytes
4d48d5a d0d0352 4d48d5a 8720cc4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import logging
from typing import Any, Dict, List, Tuple

from bs4 import (
    BeautifulSoup,
    CData,
    Comment,
    Declaration,
    Doctype,
    NavigableString,
    ProcessingInstruction,
    Tag,
)
logger = logging.getLogger(__name__)
class HTMLProcessor:
    """
    A processor for HTML content that preserves exact HTML structure
    while only translating text content.

    Typical flow:
        fragments, dom_data = processor.extract_text(html)
        translated = translate(processor.prepare_fragments_with_token(fragments, token))
        html_out = processor.replace_text(dom_data, translated)
    """

    def __init__(self):
        # CSS class that marks elements whose text must never be translated.
        self.skip_translation_class = 'notranslate'
        # Tags whose content is code/metadata/media, never natural language.
        self.skip_tags = {
            'script', 'style', 'pre', 'code', 'head', 'title', 'meta',
            'link', 'iframe', 'noscript', 'svg', 'path', 'img'
        }

    def extract_text(self, html_content: str) -> Tuple[List[str], Dict[str, Any]]:
        """
        Extract translatable text nodes from HTML content while preserving exact structure.

        Args:
            html_content: HTML content as a string

        Returns:
            A tuple containing:
            - List of text fragments to translate
            - DOM data dict holding the parsed soup and a map from fragment
              index to the exact NavigableString node it was read from
            On any parsing error, ([], {}) is returned and the error is logged.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            text_fragments: List[str] = []
            dom_map: Dict[int, Any] = {}
            self._extract_text_from_node(soup, text_fragments, dom_map)
            return text_fragments, {'soup': soup, 'node_map': dom_map}
        except Exception as e:
            logger.error("Error extracting text from HTML: %s", e)
            return [], {}

    def _extract_text_from_node(self, node, text_fragments: List[str],
                                dom_map: Dict[int, Any], path: str = ""):
        """
        Recursively extract text from nodes while maintaining exact structure.

        Args:
            node: The current BeautifulSoup node
            text_fragments: List to store extracted text
            dom_map: Dictionary to map fragment indices to their source nodes
            path: Current path in the DOM tree for debugging
        """
        # Comments, doctypes, CDATA, declarations and processing instructions
        # are NavigableString subclasses in bs4; they must never be offered
        # for translation (previously they leaked into the fragment list).
        if isinstance(node, (Comment, Doctype, CData, Declaration,
                             ProcessingInstruction)):
            return
        if isinstance(node, Tag):
            if node.name in self.skip_tags:
                return
            classes = node.get('class')
            if classes and self.skip_translation_class in classes:
                return
        if isinstance(node, NavigableString):
            # Parent check guards strings reached directly (e.g. the soup root).
            if node.parent and node.parent.name not in self.skip_tags:
                text = str(node).strip()
                if text:  # ignore whitespace-only nodes
                    index = len(text_fragments)
                    text_fragments.append(text)
                    dom_map[index] = node
            return  # strings have no children
        if isinstance(node, Tag):
            for child in node.children:
                child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
                self._extract_text_from_node(child, text_fragments, dom_map, child_path)

    def replace_text(self, dom_data: Dict[str, Any], translated_fragments: List[str]) -> str:
        """
        Replace the original text with translated text while keeping exact HTML structure.

        Args:
            dom_data: DOM data containing soup and node map (from extract_text)
            translated_fragments: List of translated text fragments

        Returns:
            HTML content with translated text and preserved structure,
            or "" on invalid input or error.
        """
        try:
            soup = dom_data.get('soup')
            node_map = dom_data.get('node_map', {})
            if not soup or not node_map:
                logger.error("Invalid DOM data for text replacement")
                return ""
            for index, node in node_map.items():
                # Indices beyond the translated list are left untouched so a
                # partial translation still yields valid HTML.
                if index < len(translated_fragments):
                    original = str(node)
                    # extract_text stripped the fragment; restore the node's
                    # surrounding whitespace so layout-significant spacing
                    # (e.g. between inline tags) is not lost.
                    leading = original[:len(original) - len(original.lstrip())]
                    trailing = original[len(original.rstrip()):]
                    node.replace_with(NavigableString(
                        f"{leading}{translated_fragments[index]}{trailing}"))
            return str(soup)
        except Exception as e:
            logger.error("Error replacing text in HTML: %s", e)
            return ""

    def prepare_fragments_with_token(self, fragments: List[str], special_token: str) -> List[str]:
        """
        Prepare text fragments by adding special language token to each fragment.

        Args:
            fragments: List of text fragments
            special_token: Special language token to add (e.g., '>>tam<<')

        Returns:
            List of fragments with token added; whitespace-only fragments are
            passed through unchanged. If the token is empty, the original
            list is returned as-is.
        """
        if not special_token:
            return fragments
        return [
            f"{special_token}{fragment}" if fragment.strip() else fragment
            for fragment in fragments
        ]