Spaces:
Running
Running
File size: 523 Bytes
a67789e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
// data_extraction/clean_html.js
const cheerio = require("cheerio");
// Selectors for page chrome and non-content elements that are dropped
// before any text is extracted.
const UNWANTED_SELECTORS = [
  "script",
  "style",
  "nav",
  "header",
  "footer",
  ".ads",
  ".advertisement",
  "#sidebar",
];

// Elements that visually start a new line or cell in a rendered page. Text on
// either side of them must not be glued together in the extracted output.
const BLOCK_BOUNDARY_SELECTORS = [
  "address",
  "article",
  "aside",
  "blockquote",
  "br",
  "dd",
  "div",
  "dl",
  "dt",
  "fieldset",
  "figcaption",
  "figure",
  "form",
  "h1",
  "h2",
  "h3",
  "h4",
  "h5",
  "h6",
  "hr",
  "li",
  "main",
  "ol",
  "p",
  "pre",
  "section",
  "table",
  "td",
  "th",
  "tr",
  "ul",
];

/**
 * Strip boilerplate from an HTML document and return its visible body text.
 *
 * Removes every element matching UNWANTED_SELECTORS, then returns the text of
 * <body> with each run of whitespace collapsed to a single space and the
 * result trimmed.
 *
 * @param {string} html - HTML markup to clean; passed straight to cheerio.load.
 * @returns {string} Whitespace-normalized text content of the document body.
 */
function cleanHTML(html) {
  const $ = cheerio.load(html);

  // A single comma-joined query removes all unwanted elements at once.
  $(UNWANTED_SELECTORS.join(", ")).remove();

  // .text() concatenates text nodes with no separator, so
  // "<p>Hello</p><p>World</p>" would come out as "HelloWorld". Surround
  // block-level elements with a space; any surplus is collapsed below.
  $(BLOCK_BOUNDARY_SELECTORS.join(", ")).before(" ").after(" ");

  return $("body").text().replace(/\s+/g, " ").trim();
}
// Expose cleanHTML as a named CommonJS export.
module.exports = { cleanHTML };
|