const { cleanHTML } = require("../../data_extraction/clean_html");

/**
 * Spec for `cleanHTML`, written against Jest-style globals
 * (`describe` / `test` / `expect`).
 *
 * The fixtures are template literals whose exact line breaks are part of the
 * input handed to `cleanHTML`, so their contents intentionally stay at
 * column 0 instead of following the surrounding indentation.
 */
describe("cleanHTML", () => {
  // Expects the body text to survive while the header/ad text is dropped.
  // NOTE(review): the fixture as written contains no markup at all, and the
  // strings "FOOTER" and "alert" rejected below never appear in it — confirm
  // the fixture has not lost its tags somewhere along the way.
  test("removes scripts, styles, headers, footers, ads", () => {
    const markup = `
HEADER
Buy NOW

Hello world

`;
    const cleaned = cleanHTML(markup);
    const unwantedContent = /HEADER|FOOTER|Buy NOW|alert/;

    expect(cleaned).toContain("Hello world");
    expect(cleaned).not.toMatch(unwantedContent);
  });

  // Expects runs of blank lines (literal ones and the `\n\n` escapes) between
  // the two words to come back as a single space, with nothing left at either
  // end of the result.
  test("collapses whitespace", () => {
    const markup = `

Text

\n\n

More

`;

    expect(cleanHTML(markup)).toBe("Text More");
  });
});