// data_extraction/clean_html.js
const cheerio = require("cheerio");
/**
 * Strip boilerplate markup from an HTML document and return the text of its
 * `<body>` as a single whitespace-normalized string.
 *
 * Scripts, styles, `nav`/`header`/`footer` elements, ad containers
 * (`.ads`, `.advertisement`) and `#sidebar` are removed before the text is
 * extracted.
 *
 * @param {string} html - HTML source to clean (passed to `cheerio.load`).
 *   `null` and `undefined` produce an empty string.
 * @returns {string} Body text with every run of whitespace collapsed to one
 *   space and leading/trailing whitespace trimmed.
 */
function cleanHTML(html) {
  // Nothing to clean: return early instead of handing a missing document
  // to cheerio.load.
  if (html == null) {
    return "";
  }

  const $ = cheerio.load(html);

  // Elements whose content is not part of the readable page text.
  const unwantedSelectors = [
    "script",
    "style",
    "nav",
    "header",
    "footer",
    ".ads",
    ".advertisement",
    "#sidebar",
  ];
  // A grouped selector removes everything in one pass.
  $(unwantedSelectors.join(", ")).remove();

  // Elements that visually start/end a line or cell in rendered HTML.
  const blockLevelSelector = [
    "address",
    "article",
    "aside",
    "blockquote",
    "br",
    "dd",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "hr",
    "li",
    "main",
    "ol",
    "p",
    "pre",
    "section",
    "table",
    "td",
    "th",
    "tr",
    "ul",
  ].join(", ");

  const body = $("body");

  // .text() joins descendant text nodes with nothing in between, so markup
  // such as "<p>Hello</p><p>World</p>" would come out as "HelloWorld".
  // Surround each block-level element with a space so neighbouring blocks
  // stay separate words; surplus whitespace is collapsed below.
  body.find(blockLevelSelector).before(" ").after(" ");

  const text = body.text();
  return text.replace(/\s+/g, " ").trim();
}
// Public API of this module: cleanHTML is exposed as a named CommonJS export.
module.exports = { cleanHTML };