Spaces:

re-mind
/

Crawl4AI

Paused

Crawl4AI / crawl4ai /html2text /utils.py

amaye15

test

03c0888 11 months ago

8.57 kB

	import html.entities
	from typing import Dict, List, Optional

	from . import config

	# Code point -> replacement table derived from config.UNIFIABLE, whose keys
	# are looked up as HTML entity names. The "nbsp" entry is excluded.
	unifiable_n = {
	    html.entities.name2codepoint[entity_name]: replacement
	    for entity_name, replacement in config.UNIFIABLE.items()
	    if entity_name != "nbsp"
	}


	def hn(tag: str) -> int:
	    """
	    Return the heading level encoded in an HTML tag name.

	    :param tag: Tag name, e.g. ``"h1"``.

	    :returns: The level (1-9) when ``tag`` is ``"h"`` followed by exactly
	        one digit from 1 to 9; 0 for anything else, including an empty
	        string.
	    :rtype: int
	    """
	    # Test the length first so an empty tag name cannot raise IndexError.
	    if len(tag) == 2 and tag[0] == "h":
	        level = tag[1]
	        # String comparison: only the characters "1".."9" pass ("0" fails).
	        if "0" < level <= "9":
	            return int(level)
	    return 0


	def dumb_property_dict(style: str) -> Dict[str, str]:
	    """
	    Parse a ``;``-separated list of CSS declarations.

	    Each declaration is split on its first ``:``; fragments without a
	    colon are ignored. Names and values are stripped and lower-cased, and
	    a repeated name keeps its last value.

	    :returns: A hash of css attributes
	    """
	    properties: Dict[str, str] = {}
	    for declaration in style.split(";"):
	        name, colon, value = declaration.partition(":")
	        if not colon:
	            continue
	        properties[name.strip().lower()] = value.strip().lower()
	    return properties


	def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
	    """
	    Parse a stylesheet into per-selector property dictionaries.

	    ``@import`` statements are removed first. The remaining text is cut at
	    every ``}``, and each piece containing a ``{`` is split into a selector
	    and a declaration body. If any piece does not split into exactly two
	    parts, the whole result is discarded and an empty dict is returned.

	    :type data: str

	    :returns: A hash of css selectors, each of which contains a hash of
	        css attributes.
	    :rtype: dict
	    """
	    # The trailing ";" guarantees every @import has a terminator to cut at.
	    data += ";"
	    while True:
	        import_start = data.find("@import")
	        if import_start == -1:
	            break
	        import_end = data.find(";", import_start)
	        data = data[:import_start] + data[import_end + 1 :]

	    rules = [chunk.split("{") for chunk in data.split("}") if "{" in chunk]

	    elements: Dict[str, Dict[str, str]] = {}
	    try:
	        for selector, body in rules:
	            elements[selector.strip()] = dumb_property_dict(body)
	    except ValueError:
	        # A chunk with more than one "{" cannot be unpacked; give up on
	        # the whole stylesheet (not that important).
	        elements = {}

	    return elements


	def element_style(
	    attrs: Dict[str, Optional[str]],
	    style_def: Dict[str, Dict[str, str]],
	    parent_style: Dict[str, str],
	) -> Dict[str, str]:
	    """
	    Compute the effective style of an element.

	    Starts from a copy of ``parent_style``, then applies the ``.<class>``
	    entry of ``style_def`` for each class named in ``attrs["class"]`` (in
	    order), and finally the inline ``attrs["style"]`` declarations.
	    ``parent_style`` itself is not modified.

	    :type attrs: dict
	    :type style_def: dict
	    :type parent_style: dict

	    :returns: A hash of the 'final' style attributes of the element
	    :rtype: dict
	    """
	    style = parent_style.copy()

	    # Attribute values are Optional[str]; a None value is treated the same
	    # as a missing attribute instead of failing an assert.
	    css_classes = attrs.get("class")
	    if css_classes:
	        for css_class in css_classes.split():
	            style.update(style_def.get("." + css_class, {}))

	    inline_style = attrs.get("style")
	    if inline_style:
	        style.update(dumb_property_dict(inline_style))

	    return style


	def google_list_style(style: Dict[str, str]) -> str:
	    """
	    Decide whether a list is ordered or unordered from its style.

	    :type style: dict

	    :returns: ``"ul"`` when ``list-style-type`` is one of ``disc``,
	        ``circle``, ``square`` or ``none``; ``"ol"`` in every other case,
	        including when the property is absent.
	    :rtype: str
	    """
	    unordered_markers = ("disc", "circle", "square", "none")
	    if style.get("list-style-type") in unordered_markers:
	        return "ul"
	    return "ol"


	def google_has_height(style: Dict[str, str]) -> bool:
	    """
	    Report whether ``style`` contains a ``height`` key.

	    Only the presence of the key is tested; its value is not inspected.

	    :type style: dict

	    :rtype: bool
	    """
	    has_height = "height" in style
	    return has_height


	def google_text_emphasis(style: Dict[str, str]) -> List[str]:
	    """
	    Collect the emphasis-related property values of an element.

	    :type style: dict

	    :returns: The values of ``text-decoration``, ``font-style`` and
	        ``font-weight`` (in that order) for whichever of them are present
	        in ``style``.
	    :rtype: list
	    """
	    emphasis_properties = ("text-decoration", "font-style", "font-weight")
	    return [style[name] for name in emphasis_properties if name in style]


	def google_fixed_width_font(style: Dict[str, str]) -> bool:
	    """
	    Tell whether the element's ``font-family`` is a known fixed-width font.

	    The value must equal ``"courier new"`` or ``"consolas"`` exactly; a
	    missing ``font-family`` counts as not fixed-width.

	    :type style: dict

	    :rtype: bool
	    """
	    fixed_width_families = ("courier new", "consolas")
	    return style.get("font-family", "") in fixed_width_families


	def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
	    """
	    Extract numbering from list element attributes

	    :type attrs: dict

	    :returns: ``int(attrs["start"]) - 1`` when the ``start`` attribute
	        holds an integer; 0 when it is absent, None, or not an integer.
	    :rtype: int
	    """
	    start = attrs.get("start")
	    # Attribute values are Optional[str]; a None value is handled like a
	    # missing attribute rather than tripping an assert.
	    if start is None:
	        return 0
	    try:
	        return int(start) - 1
	    except ValueError:
	        # Non-numeric "start" values fall back to the default numbering.
	        return 0


	def skipwrap(
	    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
	) -> bool:
	    """
	    Decide whether a paragraph must be left unwrapped.

	    :param para: The paragraph text.
	    :param wrap_links: When false, paragraphs matching ``config.RE_LINK``
	        are not wrapped.
	    :param wrap_list_items: When false, paragraphs starting with a ``-``
	        or ``*`` marker are not wrapped.
	    :param wrap_tables: When false, paragraphs matching
	        ``config.RE_TABLE`` are not wrapped.

	    :returns: True when wrapping should be skipped for ``para``.
	    :rtype: bool
	    """
	    # If it appears to contain a link
	    # don't wrap
	    if not wrap_links and config.RE_LINK.search(para):
	        return True
	    # If the text begins with four spaces or one tab, it's a code block;
	    # don't wrap. startswith() is also safe on an empty paragraph.
	    if para.startswith(("    ", "\t")):
	        return True

	    # If the text begins with only two "--", possibly preceded by
	    # whitespace, that's an emdash; so wrap.
	    stripped = para.lstrip()
	    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
	        return False

	    # A leading "-" or "*" is treated as a list marker, except for a
	    # leading "**", which opens bold text rather than a list item.
	    if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
	        return not wrap_list_items

	    # If text contains a pipe character it is likely a table
	    if not wrap_tables and config.RE_TABLE.search(para):
	        return True

	    # If the text begins with a single -, *, or +, followed by a space,
	    # or an integer, followed by a ., followed by a space (in either
	    # case optionally preceded by whitespace), it's a list; don't wrap.
	    return bool(
	        config.RE_ORDERED_LIST_MATCHER.match(stripped)
	        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
	    )


	def escape_md(text: str) -> str:
	    """
	    Backslash-escape markdown-sensitive characters inside other markdown
	    constructs.

	    Every match of ``config.RE_MD_CHARS_MATCHER`` is replaced by a
	    backslash followed by the pattern's first capture group.
	    """
	    escaped = config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
	    return escaped


	def escape_md_section(
	    text: str,
	    escape_backslash: bool = True,
	    snob: bool = False,
	    escape_dot: bool = True,
	    escape_plus: bool = True,
	    escape_dash: bool = True
	) -> str:
	    """
	    Escape markdown-sensitive characters across a whole document section.

	    The enabled substitutions run in a fixed order (backslash, snob, dot,
	    plus, dash), each on the output of the previous one.

	    :param text: Section text to escape.
	    :param escape_backslash: Apply ``config.RE_MD_BACKSLASH_MATCHER``.
	    :param snob: Apply ``config.RE_MD_CHARS_MATCHER_ALL``.
	    :param escape_dot: Apply ``config.RE_MD_DOT_MATCHER``.
	    :param escape_plus: Apply ``config.RE_MD_PLUS_MATCHER``.
	    :param escape_dash: Apply ``config.RE_MD_DASH_MATCHER``.

	    :returns: The escaped text.
	    """
	    # (enabled, name of the compiled pattern in config, replacement).
	    # Patterns are looked up by name so a disabled pass never touches config.
	    passes = (
	        (escape_backslash, "RE_MD_BACKSLASH_MATCHER", r"\\\1"),
	        (snob, "RE_MD_CHARS_MATCHER_ALL", r"\\\1"),
	        (escape_dot, "RE_MD_DOT_MATCHER", r"\1\\\2"),
	        (escape_plus, "RE_MD_PLUS_MATCHER", r"\1\\\2"),
	        (escape_dash, "RE_MD_DASH_MATCHER", r"\1\\\2"),
	    )
	    for enabled, pattern_name, replacement in passes:
	        if enabled:
	            text = getattr(config, pattern_name).sub(replacement, text)

	    return text

	def reformat_table(lines: List[str], right_margin: int) -> List[str]:
	    """
	    Pad the cells of a pipe-delimited table so its columns line up.

	    :param lines: The rows of the table, cells separated by ``|``.
	    :param right_margin: Extra width added to every column.

	    :returns: One reformatted line per input line. A row made up solely
	        of ``-`` and ``|`` is treated as a separator and padded with
	        ``-``; every other row is padded with spaces. An empty ``lines``
	        yields an empty list.
	    """
	    if not lines:
	        return []

	    # find the maximum width of the columns
	    max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
	    max_cols = len(max_width)
	    for line in lines:
	        cols = [x.rstrip() for x in line.split("|")]
	        num_cols = len(cols)

	        # don't drop any data if colspan attributes result in unequal lengths
	        if num_cols < max_cols:
	            cols += [""] * (max_cols - num_cols)
	        elif max_cols < num_cols:
	            max_width += [len(x) + right_margin for x in cols[max_cols:]]
	            max_cols = num_cols

	        max_width = [
	            max(len(x) + right_margin, old_len)
	            for x, old_len in zip(cols, max_width)
	        ]

	    # reformat
	    separator_chars = set("-|")
	    new_lines = []
	    for line in lines:
	        cols = [x.rstrip() for x in line.split("|")]
	        if set(line.strip()) == separator_chars:
	            filler, prefix = "-", "|-"
	        else:
	            filler, prefix = " ", "| "
	        # zip() stops at the shorter sequence, so a row with fewer cells
	        # than the widest row keeps its own cell count.
	        new_cols = [
	            x + filler * (width - len(x)) for x, width in zip(cols, max_width)
	        ]
	        new_lines.append(prefix + "|".join(new_cols) + "|")
	    return new_lines


	def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
	    """
	    Provide padding for tables in the text

	    Lines containing ``config.TABLE_MARKER_FOR_PAD`` toggle table mode and
	    are themselves removed. Lines collected while in table mode are passed
	    to ``reformat_table``; when a table closes, the padded rows are
	    emitted followed by one empty line. All other lines pass through
	    unchanged.

	    :param text: The text, with tables delimited by marker lines.
	    :param right_margin: Extra width added to every table column.

	    :returns: The text with its tables padded.
	    """
	    new_lines: List[str] = []
	    table_buffer: List[str] = []
	    table_started = False
	    for line in text.split("\n"):
	        # Toggle table started
	        if config.TABLE_MARKER_FOR_PAD in line:
	            table_started = not table_started
	            if not table_started:
	                # Two adjacent markers enclose no rows; there is nothing
	                # to reformat in that case.
	                if table_buffer:
	                    new_lines.extend(reformat_table(table_buffer, right_margin))
	                table_buffer = []
	                new_lines.append("")
	            continue
	        # Process lines
	        if table_started:
	            table_buffer.append(line)
	        else:
	            new_lines.append(line)

	    # A table whose closing marker never arrives would otherwise lose all
	    # of its buffered rows; emit them rather than dropping content.
	    if table_buffer:
	        new_lines.extend(reformat_table(table_buffer, right_margin))

	    return "\n".join(new_lines)