File size: 4,640 Bytes
4d48d5a
d0d0352
 
4d48d5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8720cc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import logging
from typing import Any, Dict, List, Tuple

from bs4 import BeautifulSoup, NavigableString, Tag

logger = logging.getLogger(__name__)

class HTMLProcessor:
    """
    A processor for HTML content that preserves exact HTML structure
    while only translating text content.
    """
    
    def __init__(self):
        self.skip_translation_class = 'notranslate'
        self.skip_tags = {
            'script', 'style', 'pre', 'code', 'head', 'title', 'meta',
            'link', 'iframe', 'noscript', 'svg', 'path', 'img'
        }
        
    def extract_text(self, html_content: str) -> Tuple[List[str], Dict[str, Any]]:
        """
        Extract translatable text nodes from HTML content while preserving exact structure.
        
        Args:
            html_content: HTML content as a string
            
        Returns:
            A tuple containing:
            - List of text fragments to translate
            - DOM map that maintains references to the exact nodes in the original structure
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            
            text_fragments = []
            dom_map = {}
            
            self._extract_text_from_node(soup, text_fragments, dom_map)
            
            return text_fragments, {'soup': soup, 'node_map': dom_map}
            
        except Exception as e:
            logger.error(f"Error extracting text from HTML: {str(e)}")
            return [], {}
    
    def _extract_text_from_node(self, node, text_fragments: List[str], dom_map: Dict[int, Any], path: str = ""):
        """
        Recursively extract text from nodes while maintaining exact structure.
        
        Args:
            node: The current BeautifulSoup node
            text_fragments: List to store extracted text
            dom_map: Dictionary to map indices to nodes
            path: Current path in the DOM tree for debugging
        """
        if isinstance(node, Tag) and node.name in self.skip_tags:
            return
            
        if isinstance(node, Tag) and node.get('class') and self.skip_translation_class in node.get('class'):
            return
        
        if isinstance(node, NavigableString) and node.parent and node.parent.name not in self.skip_tags:
            text = str(node).strip()
            if text:
                index = len(text_fragments)
                text_fragments.append(text)
                dom_map[index] = node
        
        if isinstance(node, Tag):
            for child in node.children:
                child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
                self._extract_text_from_node(child, text_fragments, dom_map, child_path)
    
    def replace_text(self, dom_data: Dict[str, Any], translated_fragments: List[str]) -> str:
        """
        Replace the original text with translated text while keeping exact HTML structure.
        
        Args:
            dom_data: DOM data containing soup and node map
            translated_fragments: List of translated text fragments
            
        Returns:
            HTML content with translated text and preserved structure
        """
        try:
            soup = dom_data.get('soup')
            node_map = dom_data.get('node_map', {})
            
            if not soup or not node_map:
                logger.error("Invalid DOM data for text replacement")
                return ""
            
            for index, node in node_map.items():
                if index < len(translated_fragments):
                    node.replace_with(NavigableString(translated_fragments[index]))
            
            return str(soup)
            
        except Exception as e:
            logger.error(f"Error replacing text in HTML: {str(e)}")
            return ""
            
    def prepare_fragments_with_token(self, fragments: List[str], special_token: str) -> List[str]:
        """
        Prepare text fragments by adding special language token to each fragment.
        
        Args:
            fragments: List of text fragments
            special_token: Special language token to add (e.g., '>>tam<<')
            
        Returns:
            List of fragments with token added
        """
        if not special_token:
            return fragments
            
        prepared_fragments = []
        for fragment in fragments:
            if fragment.strip():
                prepared_fragments.append(f"{special_token}{fragment}")
            else:
                prepared_fragments.append(fragment)
                
        return prepared_fragments