docsifer / tests /unit /test_html_cleaner.py
lamhieu's picture
chore: update something
6ed4967
Raw
History Blame Contribute Delete
1.1 kB
from docsifer.core.html_cleaner import clean_html
def test_removes_style_script_noscript() -> None:
html = (
"<html><head><style>body{}</style><script>x()</script></head>"
"<body><noscript>nope</noscript><p>hi</p></body></html>"
)
out = clean_html(html)
assert "<style" not in out.lower()
assert "<script" not in out.lower()
assert "<noscript" not in out.lower()
assert "hi" in out
def test_removes_hidden_attribute_and_inline_styles() -> None:
html = (
"<div hidden>secret</div>"
'<div style="display:none">also-hidden</div>'
'<div style="display: none;">spaced</div>'
'<div aria-hidden="true">aria</div>'
"<p>visible</p>"
)
out = clean_html(html)
assert "secret" not in out
assert "also-hidden" not in out
assert "spaced" not in out
assert "aria" not in out
assert "visible" in out
def test_empty_input_returns_empty() -> None:
assert clean_html("") == ""
def test_malformed_html_does_not_raise() -> None:
out = clean_html("<<<<><><>")
assert isinstance(out, str)