// TODO: refactor
import { load } from "cheerio"; // rustified
import { logger } from "../../../lib/logger";
import { extractLinks as _extractLinks } from "../../../lib/html-transformer";

/**
 * Turns a single raw `href` value into the link string that should be
 * reported, or `undefined` when the href must be skipped.
 *
 * Shared by the html-transformer path and the cheerio fallback so both apply
 * exactly the same rules:
 * - `http://`, `https://` and `mailto:` hrefs are returned as-is (trimmed);
 * - hrefs starting with `#` (fragment-only) are skipped;
 * - everything else is resolved against `baseUrl` with the `URL` constructor.
 *
 * @param rawHref - The href exactly as found in the document.
 * @param baseUrl - Base used to resolve relative hrefs.
 * @returns The link to report, or `undefined` if the href is a fragment or
 *   could not be resolved (the failure is logged, not thrown).
 */
function resolveHref(rawHref: string, baseUrl: string): string | undefined {
  const href = rawHref.trim();

  try {
    if (
      href.startsWith("http://") ||
      href.startsWith("https://") ||
      href.startsWith("mailto:")
    ) {
      // Absolute http(s) URLs and mailto: links are kept verbatim.
      return href;
    }

    if (href.startsWith("#")) {
      // Fragment-only links are ignored.
      return undefined;
    }

    // Root-relative ("/path") and other relative hrefs are resolved against
    // the base URL.
    return new URL(href, baseUrl).href;
  } catch (error) {
    // `new URL` throws on input it cannot parse; log and drop that one href
    // instead of failing the whole extraction.
    logger.error(
      `Failed to construct URL for href: ${href} with base: ${baseUrl}`,
      { error },
    );
    return undefined;
  }
}

/**
 * Extracts links using the `extractLinks` function imported from
 * `html-transformer`, then normalises each returned href with
 * {@link resolveHref}.
 *
 * @param html - HTML document to scan.
 * @param baseUrl - Base used to resolve relative hrefs.
 * @returns De-duplicated links in first-seen order.
 */
async function extractLinksRust(
  html: string,
  baseUrl: string,
): Promise<string[]> {
  const hrefs = await _extractLinks(html);
  const links: string[] = [];

  hrefs.forEach(href => {
    const resolved = resolveHref(href, baseUrl);
    if (resolved !== undefined) {
      links.push(resolved);
    }
  });

  // Remove duplicates and return
  return [...new Set(links)];
}

/**
 * Extracts the links found in an HTML document.
 *
 * Tries the html-transformer based implementation first; if that call throws,
 * a warning is logged and the `<a>` elements are scanned with cheerio instead.
 *
 * @param html - HTML document to scan.
 * @param baseUrl - Base used to resolve relative hrefs.
 * @returns De-duplicated links in first-seen order.
 */
export async function extractLinks(
  html: string,
  baseUrl: string,
): Promise<string[]> {
  try {
    return await extractLinksRust(html, baseUrl);
  } catch (error) {
    logger.warn("Failed to call html-transformer! Falling back to cheerio...", {
      error,
      module: "scrapeURL",
      method: "extractLinks",
    });
  }

  const $ = load(html);
  const links: string[] = [];

  $("a").each((_, element) => {
    const href = $(element).attr("href");
    // Anchors with a missing or empty href attribute are skipped.
    if (href) {
      const resolved = resolveHref(href, baseUrl);
      if (resolved !== undefined) {
        links.push(resolved);
      }
    }
  });

  // Remove duplicates and return
  return [...new Set(links)];
}