const { cleanHTML } = require("../../data_extraction/clean_html");

/**
 * Jest suite for `cleanHTML`.
 *
 * Each case feeds a small string fixture to `cleanHTML` and checks the
 * returned string. The fixtures are template literals whose content starts
 * at column 0 on purpose: any indentation would become part of the input.
 */
describe("cleanHTML", () => {
  test("removes scripts, styles, headers, footers, ads", () => {
    // NOTE(review): this fixture contains only plain text and none of the
    // words checked below, so the negative assertion is unlikely to ever
    // fail. Presumably the original markup (script/style/header/footer/ad
    // elements) was lost -- confirm and restore it against the selectors
    // that cleanHTML actually strips.
    const markup = `
Hello world
`;
    // Words that are expected to be absent from the cleaned output.
    const forbiddenContent = /HEADER|FOOTER|Buy NOW|alert/;

    const cleaned = cleanHTML(markup);

    expect(cleaned).toContain("Hello world");
    expect(cleaned).not.toMatch(forbiddenContent);
  });

  test("collapses whitespace", () => {
    // The input mixes literal line breaks with `\n` escapes; the expected
    // result has a single space between the words and no leading or
    // trailing whitespace.
    const markup = `Text
\n\nMore
`;

    const cleaned = cleanHTML(markup);

    expect(cleaned).toBe("Text More");
  });
});