Spaces:
Running
Running
| // data_extraction/clean_html.js | |
| const cheerio = require("cheerio"); | |
/**
 * Strip boilerplate markup from an HTML document and return its visible text.
 *
 * The markup is parsed with cheerio, every element matching a fixed list of
 * selectors (scripts, styles, nav/header/footer, ad containers, the sidebar)
 * is removed, and the text of what remains in <body> is returned with each
 * run of whitespace collapsed to a single space and both ends trimmed.
 *
 * @param {string} html - Raw HTML markup to clean.
 * @returns {string} Whitespace-normalized body text, or "" when `html` is
 *   null or undefined.
 */
function cleanHTML(html) {
  // cheerio.load() expects markup and rejects a missing (null/undefined)
  // document, so treat "no document" as "no text" instead of throwing.
  if (html == null) {
    return "";
  }

  // Load HTML into cheerio
  const $ = cheerio.load(html);

  // Selectors for content that is not part of the page's readable text.
  const unwantedSelectors = [
    "script",
    "style",
    "nav",
    "header",
    "footer",
    ".ads",
    ".advertisement",
    "#sidebar",
  ];

  // Remove unwanted elements before extracting text so their contents
  // (inline JS/CSS, menus, ads) never reach the output.
  for (const selector of unwantedSelectors) {
    $(selector).remove();
  }

  // Extract body text, normalize whitespace
  const text = $("body").text();
  return text.replace(/\s+/g, " ").trim();
}
| module.exports = { cleanHTML }; | |