Steel / api /src /utils /scrape /cleanHtml.ts
supernovagateway's picture
Upload folder using huggingface_hub
fb38ec5 verified
import { JSDOM } from "jsdom";
export const cleanHtml = (html: string): string => {
const blacklistedElements = new Set([
"head",
"title",
"meta",
"script",
"style",
"path",
"svg",
"br",
"hr",
"link",
"object",
"embed",
]);
const blacklistedAttributes = [
"style",
"ping",
"src",
"item.*",
"aria.*",
"js.*",
"data-.*",
"role",
"tabindex",
"onerror",
];
const dom = new JSDOM(html);
const document = dom.window.document;
// Remove blacklisted elements
blacklistedElements.forEach((tag) => {
const elements = document.querySelectorAll(tag);
elements.forEach((element) => {
element.remove();
});
});
// Remove blacklisted attributes
const elements = document.querySelectorAll("*");
elements.forEach((element) => {
blacklistedAttributes.forEach((attrPattern) => {
const regex = new RegExp(`^${attrPattern}$`);
Array.from(element.attributes).forEach((attr: any) => {
if (regex.test(attr.name)) {
element.removeAttribute(attr.name);
}
});
});
});
// Remove empty elements
elements.forEach((element) => {
if (!element.hasAttributes() && element.textContent?.trim() === "") {
element.remove();
}
});
const sourceCode = document.documentElement.outerHTML;
return sourceCode;
};