Spaces:
Paused
Paused
File size: 2,670 Bytes
// TODO: refactor
import { load } from "cheerio"; // rustified
import { logger } from "../../../lib/logger";
import { extractLinks as _extractLinks } from "../../../lib/html-transformer";
/**
 * Converts raw `href` attribute values into a de-duplicated list of links.
 *
 * Each value is trimmed and then handled as follows:
 * - `http://`, `https://` and `mailto:` values are kept verbatim;
 * - values starting with `#` (fragment-only links) are dropped;
 * - everything else is resolved against `baseUrl` with `new URL(href, baseUrl)`.
 *
 * A value that cannot be resolved is logged via `logger.error` and skipped, so
 * one bad link never aborts the whole extraction.
 *
 * @param hrefs   Raw `href` values in document order.
 * @param baseUrl URL that relative values are resolved against.
 * @returns Unique links, in order of first appearance.
 */
function resolveHrefs(hrefs: readonly string[], baseUrl: string): string[] {
  // A Set keeps insertion order, so de-duplication preserves first occurrences.
  const links = new Set<string>();

  for (const rawHref of hrefs) {
    const href = rawHref.trim();

    if (href.startsWith("#")) {
      // Fragment-only links (#) are ignored
      continue;
    }

    if (
      href.startsWith("http://") ||
      href.startsWith("https://") ||
      href.startsWith("mailto:")
    ) {
      // Absolute http(s) URLs and mailto: links are added as is
      links.add(href);
      continue;
    }

    try {
      // Root-relative ("/x") and document-relative ("x") values are both
      // resolved by the URL constructor against the base URL.
      links.add(new URL(href, baseUrl).href);
    } catch (error) {
      logger.error(
        `Failed to construct URL for href: ${href} with base: ${baseUrl}`,
        { error },
      );
    }
  }

  return [...links];
}

/**
 * Extracts links using the `html-transformer` implementation of
 * `extractLinks`, then normalises them with {@link resolveHrefs}.
 *
 * Any failure of the underlying call rejects the returned promise; the caller
 * is expected to handle it.
 *
 * @param html    HTML document to scan.
 * @param baseUrl URL that relative links are resolved against.
 * @returns Unique links, in order of first appearance.
 */
async function extractLinksRust(html: string, baseUrl: string): Promise<string[]> {
  const hrefs = await _extractLinks(html);
  return resolveHrefs(hrefs, baseUrl);
}

/**
 * Extracts every link from an HTML document.
 *
 * The `html-transformer` path is tried first. If it throws, a warning is
 * logged and the `href` attributes of all `<a>` elements are collected with
 * cheerio instead. Both paths share the same normalisation rules (see
 * {@link resolveHrefs}).
 *
 * @param html    HTML document to scan.
 * @param baseUrl URL that relative links are resolved against.
 * @returns Unique links, in order of first appearance.
 */
export async function extractLinks(html: string, baseUrl: string): Promise<string[]> {
  try {
    return await extractLinksRust(html, baseUrl);
  } catch (error) {
    logger.warn("Failed to call html-transformer! Falling back to cheerio...", {
      error,
      module: "scrapeURL", method: "extractLinks"
    });
  }

  const $ = load(html);
  const hrefs: string[] = [];
  $("a").each((_, element) => {
    const href = $(element).attr("href");
    // Anchors with a missing or empty href carry no link.
    if (href) {
      hrefs.push(href);
    }
  });

  return resolveHrefs(hrefs, baseUrl);
}
|