personabot-api / tests / test_parser_sanitization.py
GitHub Actions
Deploy 1ba9ba6
9563e4a
raw
history blame contribute delete
882 Bytes
import sys
from pathlib import Path
# Resolve this file's absolute path and take its third ancestor directory
# (parents[0] is the containing directory, parents[2] is two levels above it).
ROOT = Path(__file__).resolve().parents[2]
# Put ROOT at the front of the import search path, at most once, so it is
# searched before the other entries.
# NOTE(review): presumably ROOT is the directory that holds the `ingestion`
# package imported below -- confirm against the repository layout.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from ingestion.parsers.readme_parser import parse_readme_bytes
from ingestion.parsers.text_sanitizer import strip_html_tags
def test_strip_html_tags_removes_img_and_comments() -> None:
    """Check that ``strip_html_tags`` drops markup but keeps plain words.

    The sample mixes an HTML comment, an ``<img>`` tag and a ``<b>`` element
    with ordinary text.
    """
    sample = "Hello <!-- comment --> <img src='x'> world <b>bold</b>"
    result = strip_html_tags(sample)

    # None of the opening markers of the embedded markup may survive.
    for marker in ("<img", "<!--", "<b>"):
        assert marker not in result

    # The plain words outside the markup must still be present.
    for word in ("Hello", "world"):
        assert word in result
def test_parse_readme_bytes_removes_raw_html() -> None:
    """Check that ``parse_readme_bytes`` strips raw HTML from a README.

    The parsed result is indexed by ``"clean_content"``; that text must no
    longer contain the banner ``<img`` tag while still holding the prose line.
    """
    raw_readme = b"# Repo\n\n<img src='banner.png'/>\n\nSome content"
    result = parse_readme_bytes(raw_readme, repo_name="1337Xcode/demo")
    clean_text = result["clean_content"]

    assert "<img" not in clean_text
    assert "Some content" in clean_text