Spaces:

Arsive
/

lt_space

Sleeping

App Files Files Community

lt_space / app /models /html_processor.py

Arsive2

updated api server

8720cc4 9 months ago

raw

history blame contribute delete

4.64 kB

	import logging
	from typing import Any, Dict, List, Tuple

	from bs4 import BeautifulSoup, NavigableString, Tag
	from bs4.element import PreformattedString

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)

	class HTMLProcessor:
	    """
	    A processor for HTML content that preserves exact HTML structure
	    while only translating text content.

	    Intended round trip: ``extract_text`` yields the text fragments and a
	    DOM handle, the caller translates the fragments, and ``replace_text``
	    writes the translations back into the same parsed document.
	    """

	    def __init__(self):
	        # Elements carrying this CSS class are skipped together with their subtree.
	        self.skip_translation_class = 'notranslate'
	        # Tags whose entire subtree is never offered for translation.
	        self.skip_tags = {
	            'script', 'style', 'pre', 'code', 'head', 'title', 'meta',
	            'link', 'iframe', 'noscript', 'svg', 'path', 'img'
	        }

	    def extract_text(self, html_content: str) -> Tuple[List[str], Dict[str, Any]]:
	        """
	        Extract translatable text nodes from HTML content while preserving exact structure.

	        Args:
	            html_content: HTML content as a string

	        Returns:
	            A tuple containing:
	            - List of text fragments to translate (whitespace-stripped)
	            - DOM data: ``{'soup': <parsed document>, 'node_map': {index: node}}``
	              where ``index`` is the position of the fragment in the list.
	            On any parsing/extraction error ``([], {})`` is returned and the
	            error is logged.
	        """
	        try:
	            soup = BeautifulSoup(html_content, 'html.parser')

	            text_fragments: List[str] = []
	            dom_map: Dict[int, Any] = {}

	            self._extract_text_from_node(soup, text_fragments, dom_map)

	            return text_fragments, {'soup': soup, 'node_map': dom_map}

	        except Exception as e:
	            # Best-effort API: callers get an empty result instead of an exception.
	            logger.exception("Error extracting text from HTML: %s", e)
	            return [], {}

	    def _extract_text_from_node(self, node, text_fragments: List[str], dom_map: Dict[int, Any], path: str = ""):
	        """
	        Recursively extract text from nodes while maintaining exact structure.

	        Args:
	            node: The current BeautifulSoup node
	            text_fragments: List to store extracted text (appended in document order)
	            dom_map: Dictionary mapping fragment index -> text node
	            path: Current path in the DOM tree; only threaded through the
	                recursion for debugging, it does not affect the result
	        """
	        if isinstance(node, Tag):
	            if node.name in self.skip_tags:
	                return

	            classes = node.get('class') or []
	            if isinstance(classes, str):
	                # A plain string would be substring-matched; compare whole class names.
	                classes = classes.split()
	            if self.skip_translation_class in classes:
	                return

	            for child in node.children:
	                child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
	                self._extract_text_from_node(child, text_fragments, dom_map, child_path)
	            return

	        # Comments, doctypes, CDATA, declarations and processing instructions
	        # are NavigableString subclasses but are markup, not translatable text.
	        # Extracting them would later turn them into visible plain text.
	        if isinstance(node, PreformattedString):
	            return

	        if (
	            isinstance(node, NavigableString)
	            and node.parent is not None
	            and node.parent.name not in self.skip_tags
	        ):
	            text = str(node).strip()
	            if text:
	                dom_map[len(text_fragments)] = node
	                text_fragments.append(text)

	    @staticmethod
	    def _split_padding(text: str) -> Tuple[str, str]:
	        """Return the (leading, trailing) whitespace surrounding ``text``'s content."""
	        stripped = text.strip()
	        if not stripped:
	            # Whitespace-only: treat all of it as leading padding.
	            return text, ""
	        start = text.index(stripped)
	        return text[:start], text[start + len(stripped):]

	    def replace_text(self, dom_data: Dict[str, Any], translated_fragments: List[str]) -> str:
	        """
	        Replace the original text with translated text while keeping exact HTML structure.

	        The whitespace that surrounded each original text node is re-attached
	        around its translation, so spacing between inline elements survives.
	        Fragments that are missing or not strings leave the original text in place.

	        Args:
	            dom_data: DOM data containing soup and node map (as returned by
	                ``extract_text``)
	            translated_fragments: List of translated text fragments, aligned
	                by index with the fragments returned by ``extract_text``

	        Returns:
	            HTML content with translated text and preserved structure, or an
	            empty string when ``dom_data`` is invalid or replacement fails.
	        """
	        try:
	            soup = dom_data.get('soup')
	            node_map = dom_data.get('node_map') or {}

	            if soup is None:
	                logger.error("Invalid DOM data for text replacement")
	                return ""

	            # An empty node_map is valid: the document simply has no
	            # translatable text and is returned unchanged.
	            available = len(translated_fragments)
	            if available < len(node_map):
	                logger.warning(
	                    "Received %d translated fragments for %d text nodes; "
	                    "remaining nodes keep their original text",
	                    available, len(node_map),
	                )

	            for index, node in list(node_map.items()):
	                if index >= available:
	                    continue
	                translated = translated_fragments[index]
	                if not isinstance(translated, str):
	                    continue

	                leading, trailing = self._split_padding(str(node))
	                replacement = NavigableString(f"{leading}{translated}{trailing}")
	                node.replace_with(replacement)
	                # Keep the map pointing at live nodes so dom_data stays reusable.
	                node_map[index] = replacement

	            return str(soup)

	        except Exception as e:
	            # Best-effort API: callers get an empty string instead of an exception.
	            logger.exception("Error replacing text in HTML: %s", e)
	            return ""

	    def prepare_fragments_with_token(self, fragments: List[str], special_token: str) -> List[str]:
	        """
	        Prepare text fragments by adding special language token to each fragment.

	        Args:
	            fragments: List of text fragments
	            special_token: Special language token to add (e.g., '>>tam<<')

	        Returns:
	            List of fragments with the token prepended. Empty or
	            whitespace-only fragments are passed through untouched. When
	            ``special_token`` is empty the input list itself is returned.
	        """
	        if not special_token:
	            return fragments

	        return [
	            f"{special_token}{fragment}" if fragment.strip() else fragment
	            for fragment in fragments
	        ]