// TODO: refactor

import { AnyNode, Cheerio, load } from "cheerio"; // rustified
import { ScrapeOptions } from "../../../controllers/v1/types";
import { transformHtml } from "../../../lib/html-transformer";
import { logger } from "../../../lib/logger";

/**
 * CSS selectors (element names, classes and ids) naming page chrome:
 * headers, footers, navigation, sidebars, modals/overlays, ads, language
 * pickers, social/share widgets, breadcrumbs and cookie banners.
 *
 * NOTE(review): going by the name, these are presumably stripped when only
 * the main content is requested; the code that consumes this list is outside
 * this excerpt — confirm.
 */
const excludeNonMainTags = [
  "header",
  "footer",
  "nav",
  "aside",
  ".header",
  ".top",
  ".navbar",
  "#header",
  ".footer",
  ".bottom",
  "#footer",
  ".sidebar",
  ".side",
  ".aside",
  "#sidebar",
  ".modal",
  ".popup",
  "#modal",
  ".overlay",
  ".ad",
  ".ads",
  ".advert",
  "#ad",
  ".lang-selector",
  ".language",
  "#language-selector",
  ".social",
  ".social-media",
  ".social-links",
  "#social",
  ".menu",
  ".navigation",
  "#nav",
  ".breadcrumbs",
  "#breadcrumbs",
  ".share",
  "#share",
  ".widget",
  "#widget",
  ".cookie",
  "#cookie",
];

/**
 * `#main` plus a set of `.swoogo-*` class selectors.
 *
 * NOTE(review): presumably these are kept even when they would otherwise
 * match an excluded selector; the consuming code is outside this excerpt —
 * confirm.
 */
const forceIncludeMainTags = [
  "#main",
  ".swoogo-cols",
  ".swoogo-text",
  ".swoogo-table-div",
  ".swoogo-space",
  ".swoogo-alert",
  ".swoogo-sponsors",
  ".swoogo-title",
  ".swoogo-tabs",
  ".swoogo-logo",
  ".swoogo-image",
  ".swoogo-button",
  ".swoogo-agenda"
];

/**
 * Filters an HTML document according to the tag options in `scrapeOptions`.
 *
 * The work is first delegated to `transformHtml`, which receives the HTML,
 * the URL, the include/exclude tag lists (each entry trimmed, blank entries
 * dropped) and the `onlyMainContent` flag; its result is returned directly.
 * If that call throws or rejects, a warning is logged and the function
 * carries on with a cheerio-based fallback.
 *
 * Only the beginning of the fallback is visible in this excerpt: it keeps
 * just the `includeTags` matches when any non-blank include tag is given,
 * removes `script`, `style`, `noscript`, `meta` and `head`, and then starts
 * processing `excludeTags`. What the fallback ultimately returns is not
 * shown here.
 *
 * @param html - Markup to transform.
 * @param url - Passed through to `transformHtml`; not otherwise used in the
 *   visible part of the function.
 * @param scrapeOptions - Source of `includeTags`, `excludeTags` and
 *   `onlyMainContent`.
 */
export const htmlTransform = async (
  html: string,
  url: string,
  scrapeOptions: ScrapeOptions,
) => {
  try {
    return await transformHtml({
      html,
      url,
      include_tags: (scrapeOptions.includeTags ?? []).map(x => x.trim()).filter((x) => x.length !== 0),
      exclude_tags: (scrapeOptions.excludeTags ?? []).map(x => x.trim()).filter((x) => x.length !== 0),
      only_main_content: scrapeOptions.onlyMainContent,
    })
  } catch (error) {
    // Best-effort: a failure of the transformer is logged, not rethrown.
    // NOTE(review): the metadata says method "extractLinks" although this is
    // htmlTransform — looks like a copy-paste leftover; confirm before changing.
    logger.warn("Failed to call html-transformer! Falling back to cheerio...", { error, module: "scrapeURL", method: "extractLinks" });
  }

  let soup = load(html);

  // remove unwanted elements
  if (
    scrapeOptions.includeTags &&
    scrapeOptions.includeTags.filter((x) => x.trim().length !== 0).length > 0
  ) {
    // Create a new root element to hold the tags to keep
    // NOTE(review): the string literal below contains nothing but a line
    // break, so the markup that was meant to create the root element appears
    // to have been lost — restore it from version control.
    const newRoot = load("
")("div");
    // NOTE(review): this loop walks the raw includeTags, so untrimmed or blank
    // entries reach soup(tag) here even though the guard above ignores them
    // and the transformHtml call filters them out.
    scrapeOptions.includeTags.forEach((tag) => {
      soup(tag).each((_, element) => {
        // Clone so the match is copied into the new root rather than moved.
        newRoot.append(soup(element).clone());
      });
    });

    // From here on, work only with the elements that were explicitly included.
    soup = load(newRoot.html() ?? "");
  }

  soup("script, style, noscript, meta, head").remove();

  if (
    scrapeOptions.excludeTags &&
    scrapeOptions.excludeTags.filter((x) => x.trim().length !== 0).length > 0
  ) {
    scrapeOptions.excludeTags.forEach((tag) => {
      let elementsToRemove: Cheerio