# Source path: lt_space/app/models/html_processor.py
# Last commit: "updated api server" (8720cc4), author avatar: Arsive2
import logging
from typing import Any, Dict, List, Tuple

from bs4 import BeautifulSoup, NavigableString, Tag
from bs4.element import PreformattedString
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class HTMLProcessor:
    """
    A processor for HTML content that translates text while keeping markup.

    ``extract_text`` parses the document and collects the translatable text
    nodes; ``replace_text`` writes translated strings back into the same
    parsed tree and serializes it. Tags and attributes are never edited,
    only the text nodes recorded during extraction.
    """

    def __init__(self):
        # Elements carrying this CSS class are skipped together with
        # everything nested inside them.
        self.skip_translation_class = 'notranslate'
        # Tags whose entire subtree is never offered for translation.
        self.skip_tags = {
            'script', 'style', 'pre', 'code', 'head', 'title', 'meta',
            'link', 'iframe', 'noscript', 'svg', 'path', 'img'
        }

    def extract_text(self, html_content: str) -> Tuple[List[str], Dict[str, Any]]:
        """
        Extract translatable text nodes from HTML content.

        Args:
            html_content: HTML content as a string

        Returns:
            A tuple containing:
            - List of text fragments to translate (each stripped of
              surrounding whitespace)
            - DOM data: a dict with the parsed tree under ``'soup'`` and,
              under ``'node_map'``, a mapping from fragment index to the
              text node that fragment came from.

            On any failure the error is logged and ``([], {})`` is returned.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            text_fragments: List[str] = []
            dom_map: Dict[int, Any] = {}
            self._extract_text_from_node(soup, text_fragments, dom_map)
            return text_fragments, {'soup': soup, 'node_map': dom_map}
        except Exception as e:
            # Best-effort API: callers get an empty result, never an exception.
            logger.exception("Error extracting text from HTML: %s", e)
            return [], {}

    def _extract_text_from_node(self, node, text_fragments: List[str], dom_map: Dict[int, Any], path: str = ""):
        """
        Recursively collect translatable text nodes below ``node``.

        Args:
            node: The current BeautifulSoup node
            text_fragments: List that receives the extracted text, in
                document order
            dom_map: Dictionary that receives ``fragment index -> text node``
            path: Slash-separated tag path of ``node``; only threaded through
                the recursion as a debugging aid, it does not affect output
        """
        if isinstance(node, Tag):
            # Skipped tags and opted-out elements hide their whole subtree.
            if node.name in self.skip_tags:
                return
            if self.skip_translation_class in (node.get('class') or ()):
                return
            for child in node.children:
                child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
                self._extract_text_from_node(child, text_fragments, dom_map, child_path)
            return

        # Comments, doctype, CDATA and processing instructions are
        # NavigableString subclasses, but they are markup rather than visible
        # text. Translating them would rewrite them as plain text on output.
        if isinstance(node, PreformattedString):
            return

        if not isinstance(node, NavigableString):
            return
        parent = node.parent
        if parent is None or parent.name in self.skip_tags:
            return

        text = str(node).strip()
        if text:
            # The fragment's position in the list is its key in the map.
            dom_map[len(text_fragments)] = node
            text_fragments.append(text)

    def replace_text(self, dom_data: Dict[str, Any], translated_fragments: List[str]) -> str:
        """
        Replace the original text with translated text in the parsed tree.

        The whitespace that surrounded each original fragment inside its text
        node is kept, so spacing between inline elements is not lost.

        Args:
            dom_data: DOM data containing soup and node map, as returned by
                ``extract_text``
            translated_fragments: List of translated text fragments, indexed
                like the fragments returned by ``extract_text``

        Returns:
            The serialized HTML with translated text. A document with no
            translatable text is returned unchanged. Returns ``""`` when
            ``dom_data`` has no parsed tree or when replacement fails.
        """
        try:
            soup = dom_data.get('soup')
            node_map = dom_data.get('node_map') or {}
            if soup is None:
                logger.error("Invalid DOM data for text replacement")
                return ""

            available = len(translated_fragments)
            if available < len(node_map):
                logger.warning(
                    "Received %d translated fragments for %d text nodes; "
                    "the remaining nodes keep their original text",
                    available, len(node_map),
                )

            # Iterate over a snapshot because the map is refreshed below.
            for index, node in list(node_map.items()):
                if index >= available:
                    continue
                original = str(node)
                # Split off the whitespace that extraction stripped away.
                body_start = len(original) - len(original.lstrip())
                body_end = max(len(original.rstrip()), body_start)
                replacement = NavigableString(
                    f"{original[:body_start]}{translated_fragments[index]}{original[body_end:]}"
                )
                node.replace_with(replacement)
                # Point the map at the live node so the same dom_data can be
                # passed to replace_text again.
                node_map[index] = replacement
            return str(soup)
        except Exception as e:
            # Best-effort API: callers get an empty string, never an exception.
            logger.exception("Error replacing text in HTML: %s", e)
            return ""

    def prepare_fragments_with_token(self, fragments: List[str], special_token: str) -> List[str]:
        """
        Prepare text fragments by adding special language token to each fragment.

        Args:
            fragments: List of text fragments
            special_token: Special language token to add (e.g., '>>tam<<')

        Returns:
            List of fragments with the token prepended to every fragment
            that contains non-whitespace text. Blank fragments are passed
            through untouched. When ``special_token`` is empty the input
            list itself is returned.
        """
        if not special_token:
            return fragments
        return [
            f"{special_token}{fragment}" if fragment.strip() else fragment
            for fragment in fragments
        ]