// Provenance: uploaded by Echo-AI-official ("Upload 280 files", commit 0e759d2, verified).
import axios, { AxiosError } from "axios";
import { load } from "cheerio"; // rustified
import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import robotsParser, { Robot } from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../lib/timeout";
import { logger as _logger } from "../../lib/logger";
import https from "https";
import { redisConnection } from "../../services/queue-service";
import { extractLinks } from "../../lib/html-transformer";
import { TimeoutSignal } from "../../controllers/v1/types";
/**
 * Crawls a website starting from an initial URL, discovering links via
 * sitemaps and crawled HTML pages while honoring robots.txt, include/exclude
 * patterns, URL depth limits, and an overall page limit.
 *
 * Deduplication across workers is done through Redis sets keyed by the job id.
 */
export class WebCrawler {
  private jobId: string;
  private initialUrl: string;
  // Origin (or caller-supplied base) used to resolve relative links and robots.txt.
  private baseUrl: string;
  // Regex pattern strings; applied to the path, or to the full URL when regexOnFullURL is set.
  private includes: string[];
  private excludes: string[];
  private maxCrawledLinks: number;
  private maxCrawledDepth: number;
  private visited: Set<string> = new Set();
  private crawledUrls: Map<string, string> = new Map();
  // Hard cap on the number of URLs this crawl may emit.
  private limit: number;
  private robotsTxtUrl: string;
  public robots: Robot;
  private generateImgAltText: boolean;
  private allowBackwardCrawling: boolean;
  private allowExternalContentLinks: boolean;
  private allowSubdomains: boolean;
  private ignoreRobotsTxt: boolean;
  private regexOnFullURL: boolean;
  private logger: typeof _logger;
  // Sitemap URLs already fetched; shared across recursive sitemap fetches to cap fan-out.
  private sitemapsHit: Set<string> = new Set();
  private maxDiscoveryDepth: number | undefined;
  private currentDiscoveryDepth: number;

  /**
   * @param jobId - Crawl job identifier; used for Redis keys and log context.
   * @param initialUrl - The URL the crawl starts from.
   * @param baseUrl - Optional base; defaults to the origin of initialUrl.
   * @param includes / excludes - Regex pattern strings for link filtering.
   * @param maxCrawledLinks - Deprecated; superseded by `limit`.
   * @param limit - Maximum number of URLs to emit.
   * @param maxDiscoveryDepth / currentDiscoveryDepth - Crawl-recursion depth
   *   budget; when current reaches max, no further links are emitted.
   */
  constructor({
    jobId,
    initialUrl,
    baseUrl,
    includes,
    excludes,
    maxCrawledLinks = 10000,
    limit = 10000,
    generateImgAltText = false,
    maxCrawledDepth = 10,
    allowBackwardCrawling = false,
    allowExternalContentLinks = false,
    allowSubdomains = false,
    ignoreRobotsTxt = false,
    regexOnFullURL = false,
    maxDiscoveryDepth,
    currentDiscoveryDepth,
  }: {
    jobId: string;
    initialUrl: string;
    baseUrl?: string;
    includes?: string[];
    excludes?: string[];
    maxCrawledLinks?: number;
    limit?: number;
    generateImgAltText?: boolean;
    maxCrawledDepth?: number;
    allowBackwardCrawling?: boolean;
    allowExternalContentLinks?: boolean;
    allowSubdomains?: boolean;
    ignoreRobotsTxt?: boolean;
    regexOnFullURL?: boolean;
    maxDiscoveryDepth?: number;
    currentDiscoveryDepth?: number;
  }) {
    this.jobId = jobId;
    this.initialUrl = initialUrl;
    this.baseUrl = baseUrl ?? new URL(initialUrl).origin;
    this.includes = Array.isArray(includes) ? includes : [];
    this.excludes = Array.isArray(excludes) ? excludes : [];
    this.limit = limit;
    this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
    // Start with an empty robots.txt; importRobotsTxt() replaces this once fetched.
    this.robots = robotsParser(this.robotsTxtUrl, "");
    // Deprecated, use limit instead
    this.maxCrawledLinks = maxCrawledLinks ?? limit;
    this.maxCrawledDepth = maxCrawledDepth ?? 10;
    this.generateImgAltText = generateImgAltText ?? false;
    this.allowBackwardCrawling = allowBackwardCrawling ?? false;
    this.allowExternalContentLinks = allowExternalContentLinks ?? false;
    this.allowSubdomains = allowSubdomains ?? false;
    this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
    this.regexOnFullURL = regexOnFullURL ?? false;
    this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
    this.maxDiscoveryDepth = maxDiscoveryDepth;
    this.currentDiscoveryDepth = currentDiscoveryDepth ?? 0;
  }

  /**
   * Applies all crawl filters (discovery depth, URL depth, include/exclude
   * regexes, backward-crawl restriction, robots.txt, file extensions) to a
   * list of candidate links, returning at most `limit` surviving links.
   *
   * When FIRECRAWL_DEBUG_FILTER_LINKS is set, every rejection reason is logged.
   */
  public filterLinks(
    sitemapLinks: string[],
    limit: number,
    maxDepth: number,
    fromMap: boolean = false,
  ): string[] {
    // Discovery-depth budget exhausted: emit nothing further.
    if (this.currentDiscoveryDepth === this.maxDiscoveryDepth) {
      this.logger.debug("Max discovery depth hit, filtering off all links", { currentDiscoveryDepth: this.currentDiscoveryDepth, maxDiscoveryDepth: this.maxDiscoveryDepth });
      return [];
    }

    // If the initial URL is a sitemap.xml, skip filtering
    if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
      return sitemapLinks.slice(0, limit);
    }

    return sitemapLinks
      .filter((link) => {
        let url: URL;
        try {
          url = new URL(link.trim(), this.baseUrl);
        } catch (error) {
          this.logger.debug(`Error processing link: ${link}`, {
            link,
            error,
            method: "filterLinks",
          });
          return false;
        }
        const path = url.pathname;

        const depth = getURLDepth(url.toString());

        // Check if the link exceeds the maximum depth allowed
        if (depth > maxDepth) {
          if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
            this.logger.debug(`${link} DEPTH FAIL`);
          }
          return false;
        }

        // Include/exclude regexes match the full URL or just the path,
        // depending on the regexOnFullURL flag.
        const excincPath = this.regexOnFullURL ? link : path;

        // Check if the link should be excluded
        if (this.excludes.length > 0 && this.excludes[0] !== "") {
          if (
            this.excludes.some((excludePattern) =>
              new RegExp(excludePattern).test(excincPath),
            )
          ) {
            if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
              this.logger.debug(`${link} EXCLUDE FAIL`);
            }
            return false;
          }
        }

        // Check if the link matches the include patterns, if any are specified
        if (this.includes.length > 0 && this.includes[0] !== "") {
          if (
            !this.includes.some((includePattern) =>
              new RegExp(includePattern).test(excincPath),
            )
          ) {
            if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
              this.logger.debug(`${link} INCLUDE FAIL`);
            }
            return false;
          }
        }

        // Normalize the initial URL and the link to account for www and non-www versions
        const normalizedInitialUrl = new URL(this.initialUrl);
        let normalizedLink;
        try {
          normalizedLink = new URL(link);
        } catch (_) {
          if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
            this.logger.debug(`${link} URL PARSE FAIL`);
          }
          return false;
        }
        const initialHostname = normalizedInitialUrl.hostname.replace(
          /^www\./,
          "",
        );
        const linkHostname = normalizedLink.hostname.replace(/^www\./, "");

        // Ensure the protocol and hostname match, and the path starts with the initial URL's path
        // commented to able to handling external link on allowExternalContentLinks
        // if (linkHostname !== initialHostname) {
        //   return false;
        // }

        if (!this.allowBackwardCrawling) {
          // Only descend: the link's path must extend the initial URL's path.
          if (
            !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
          ) {
            if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
              this.logger.debug(`${link} BACKWARDS FAIL ${normalizedLink.pathname} ${normalizedInitialUrl.pathname}`);
            }
            return false;
          }
        }

        // Both historical agent spellings are checked; unknown paths default to allowed.
        const isAllowed = this.ignoreRobotsTxt
          ? true
          : ((this.robots.isAllowed(link, "FireCrawlAgent") || this.robots.isAllowed(link, "FirecrawlAgent")) ?? true);
        // Check if the link is disallowed by robots.txt
        if (!isAllowed) {
          this.logger.debug(`Link disallowed by robots.txt: ${link}`, {
            method: "filterLinks",
            link,
          });
          if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
            this.logger.debug(`${link} ROBOTS FAIL`);
          }
          return false;
        }

        if (this.isFile(link)) {
          if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
            this.logger.debug(`${link} FILE FAIL`);
          }
          return false;
        }

        if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
          this.logger.debug(`${link} OK`);
        }
        return true;
      })
      .slice(0, limit);
  }

  /**
   * Fetches the raw robots.txt body for the base URL.
   *
   * @param skipTlsVerification - When true, disables certificate validation
   *   (for sites with broken TLS).
   * @param abort - Optional signal to cancel the request.
   * @throws Axios errors (network failure, non-2xx, timeout) propagate to the caller.
   */
  public async getRobotsTxt(skipTlsVerification = false, abort?: AbortSignal): Promise<string> {
    // Typed explicitly; the previous untyped `{}` with a string-index
    // assignment does not compile under `strict`.
    const extraArgs: { httpsAgent?: https.Agent } = {};
    if (skipTlsVerification) {
      extraArgs.httpsAgent = new https.Agent({
        rejectUnauthorized: false,
      });
    }
    const response = await axios.get(this.robotsTxtUrl, {
      timeout: axiosTimeout,
      signal: abort,
      ...extraArgs,
    });
    return response.data;
  }

  /** Replaces the (initially empty) robots rules with a fetched robots.txt body. */
  public importRobotsTxt(txt: string) {
    this.robots = robotsParser(this.robotsTxtUrl, txt);
  }

  /**
   * Attempts to enumerate URLs from the site's sitemap(s): the conventional
   * /sitemap.xml plus any sitemaps declared in robots.txt. Discovered URLs are
   * filtered, deduplicated through Redis, and streamed to `urlsHandler`.
   *
   * @returns The number of sitemap URLs found (0 on timeout or error — sitemap
   *   discovery is best-effort and never fails the crawl).
   */
  public async tryGetSitemap(
    urlsHandler: (urls: string[]) => unknown,
    fromMap: boolean = false,
    onlySitemap: boolean = false,
    timeout: number = 120000,
    abort?: AbortSignal,
    mock?: string,
  ): Promise<number> {
    this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
      method: "tryGetSitemap",
    });
    let leftOfLimit = this.limit;

    // Canonical form used for Redis dedupe: strip scheme, leading www, trailing slash.
    const normalizeUrl = (url: string) => {
      url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
      if (url.endsWith("/")) {
        url = url.slice(0, -1);
      }
      return url;
    };

    const _urlsHandler = async (urls: string[]) => {
      if (fromMap && onlySitemap) {
        // Map-only mode: pass sitemap URLs through unfiltered.
        return urlsHandler(urls);
      } else {
        let filteredLinks = this.filterLinks(
          [...new Set(urls)].filter(x => this.filterURL(x, this.initialUrl) !== null),
          leftOfLimit,
          this.maxCrawledDepth,
          fromMap,
        );
        leftOfLimit -= filteredLinks.length;
        let uniqueURLs: string[] = [];
        for (const url of filteredLinks) {
          // SADD returns 1 only for first insertion — cross-worker dedupe.
          if (
            await redisConnection.sadd(
              "sitemap:" + this.jobId + ":links",
              normalizeUrl(url),
            )
          ) {
            uniqueURLs.push(url);
          }
        }

        // "NX": only set the TTL if the key doesn't already have one.
        await redisConnection.expire(
          "sitemap:" + this.jobId + ":links",
          3600,
          "NX",
        );
        if (uniqueURLs.length > 0) {
          return urlsHandler(uniqueURLs);
        }
      }
    };

    // Keep the timer handle so it can be cleared once the race settles.
    // Previously the timer was never cleared: after a successful fetch it
    // would still fire, rejecting a promise nobody awaits (unhandled
    // rejection) and keeping the event loop alive for up to `timeout` ms.
    let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
    const timeoutPromise = new Promise<never>((_, reject) => {
      timeoutHandle = setTimeout(() => reject(new Error("Sitemap fetch timeout")), timeout);
    });

    try {
      let count = (await Promise.race([
        Promise.all([
          this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort, mock),
          ...this.robots
            .getSitemaps()
            .map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort, mock)),
        ]).then((results) => results.reduce((a, x) => a + x, 0)),
        timeoutPromise,
      ])) as number;

      if (count > 0) {
        // Also emit the initial URL itself if it hasn't been seen yet.
        if (
          await redisConnection.sadd(
            "sitemap:" + this.jobId + ":links",
            normalizeUrl(this.initialUrl),
          )
        ) {
          urlsHandler([this.initialUrl]);
        }
        count++;
      }

      return count;
    } catch (error) {
      // Guard with instanceof so a thrown non-Error can't crash this handler.
      if (error instanceof Error && error.message === "Sitemap fetch timeout") {
        this.logger.warn("Sitemap fetch timed out", {
          method: "tryGetSitemap",
          timeout,
        });
        return 0;
      }
      this.logger.error("Error fetching sitemap", {
        method: "tryGetSitemap",
        error,
      });
      return 0;
    } finally {
      if (timeoutHandle !== undefined) {
        clearTimeout(timeoutHandle);
      }
    }
  }

  /**
   * Resolves `href` against `url` and decides whether it should be crawled.
   *
   * @returns The absolute URL to crawl, or null if the link is filtered out.
   *   As a side effect, internal links blocked only by robots.txt are recorded
   *   in a Redis set (fire-and-forget) for later reporting.
   */
  public filterURL(href: string, url: string): string | null {
    let fullUrl = href;
    if (!href.startsWith("http")) {
      try {
        fullUrl = new URL(href, url).toString();
      } catch (_) {
        return null;
      }
    }
    let urlObj;
    try {
      urlObj = new URL(fullUrl);
    } catch (_) {
      return null;
    }
    const path = urlObj.pathname;

    if (this.isInternalLink(fullUrl)) {
      // INTERNAL LINKS
      if (
        this.isInternalLink(fullUrl) &&
        this.noSections(fullUrl) &&
        !this.matchesExcludes(path) &&
        this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
      ) {
        return fullUrl;
      } else if (
        this.isInternalLink(fullUrl) &&
        this.noSections(fullUrl) &&
        !this.matchesExcludes(path) &&
        !this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
      ) {
        // Would have been crawled but robots.txt forbids it — record it
        // (fire-and-forget; result intentionally not awaited).
        (async () => {
          await redisConnection.sadd(
            "crawl:" + this.jobId + ":robots_blocked",
            fullUrl,
          );
          await redisConnection.expire(
            "crawl:" + this.jobId + ":robots_blocked",
            24 * 60 * 60,
          );
        })();
      }
    } else {
      // EXTERNAL LINKS
      if (
        this.isInternalLink(url) &&
        this.allowExternalContentLinks &&
        !this.isSocialMediaOrEmail(fullUrl) &&
        !this.matchesExcludes(fullUrl, true) &&
        !this.isExternalMainPage(fullUrl)
      ) {
        return fullUrl;
      }
    }

    // Subdomains of the base domain are allowed independently of the above.
    if (
      this.allowSubdomains &&
      !this.isSocialMediaOrEmail(fullUrl) &&
      this.isSubdomain(fullUrl)
    ) {
      return fullUrl;
    }

    return null;
  }

  /**
   * Extracts links via the native (Rust) html-transformer and filters them.
   * NOTE(review): this returns the raw extracted hrefs, not the (possibly
   * resolved) URL filterURL computes — presumably the native extractor already
   * returns absolute URLs; confirm against html-transformer's contract.
   */
  private async extractLinksFromHTMLRust(html: string, url: string) {
    return (await extractLinks(html)).filter(x => this.filterURL(x, url));
  }

  /**
   * Cheerio-based fallback link extractor: collects filtered <a href> links,
   * repairing malformed single-slash scheme prefixes ("https:/x" -> "https://x"),
   * and recurses into iframes with inline data:text/html sources.
   */
  private extractLinksFromHTMLCheerio(html: string, url: string) {
    let links: string[] = [];

    const $ = load(html);
    $("a").each((_, element) => {
      let href = $(element).attr("href");
      if (href) {
        if (href.match(/^https?:\/[^\/]/)) {
          href = href.replace(/^https?:\//, "$&/");
        }
        const u = this.filterURL(href, url);
        if (u !== null) {
          links.push(u);
        }
      }
    });

    // Extract links from iframes with inline src
    $("iframe").each((_, element) => {
      const src = $(element).attr("src");
      if (src && src.startsWith("data:text/html")) {
        const iframeHtml = decodeURIComponent(src.split(",")[1]);
        const iframeLinks = this.extractLinksFromHTMLCheerio(iframeHtml, url);
        links = links.concat(iframeLinks);
      }
    });

    return links;
  }

  /**
   * Extracts crawlable links from an HTML document. Prefers the native
   * extractor, normalizing results to absolute hrefs and deduplicating;
   * falls back to cheerio if the native call fails.
   */
  public async extractLinksFromHTML(html: string, url: string) {
    try {
      return [...new Set((await this.extractLinksFromHTMLRust(html, url)).map(x => {
        try {
          return new URL(x, url).href
        } catch (e) {
          return null;
        }
      }).filter(x => x !== null) as string[])];
    } catch (error) {
      this.logger.warn("Failed to call html-transformer! Falling back to cheerio...", {
        error,
        module: "scrapeURL", method: "extractMetadata"
      });
    }

    return this.extractLinksFromHTMLCheerio(html, url);
  }

  /** robots.txt check for both historical agent spellings; defaults to allowed. */
  private isRobotsAllowed(
    url: string,
    ignoreRobotsTxt: boolean = false,
  ): boolean {
    return ignoreRobotsTxt
      ? true
      : this.robots
        ? ((this.robots.isAllowed(url, "FireCrawlAgent") || this.robots.isAllowed(url, "FirecrawlAgent")) ?? true)
        : true;
  }

  /**
   * True if `url` matches any configured exclude pattern.
   * With `onlyDomains`, patterns are interpreted as domain/path prefixes
   * (see matchesExcludesExternalDomains) instead of regexes.
   */
  private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
    // Previously this nested an `excludes.some` inside another `excludes.some`,
    // re-testing every pattern once per pattern (O(n^2)) and invoking the
    // external-domain check once per pattern. Single pass, same results.
    if (onlyDomains) {
      return this.matchesExcludesExternalDomains(url);
    }
    return this.excludes.some((pattern) => new RegExp(pattern).test(url));
  }

  // supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
  private matchesExcludesExternalDomains(url: string) {
    try {
      const urlObj = new URL(url);
      const hostname = urlObj.hostname;
      const pathname = urlObj.pathname;

      for (let domain of this.excludes) {
        // Prefix a scheme so bare domains ("example.com/blog") parse as URLs.
        let domainObj = new URL("http://" + domain.replace(/^https?:\/\//, ""));
        let domainHostname = domainObj.hostname;
        let domainPathname = domainObj.pathname;

        // Match the domain itself or any of its subdomains, then the path prefix.
        if (
          hostname === domainHostname ||
          hostname.endsWith(`.${domainHostname}`)
        ) {
          if (pathname.startsWith(domainPathname)) {
            return true;
          }
        }
      }
      return false;
    } catch (e) {
      return false;
    }
  }

  /** True when the URL has no non-empty path segments (i.e. it is a site's main page). */
  private isExternalMainPage(url: string): boolean {
    return !Boolean(
      url
        .split("/")
        .slice(3)
        .filter((subArray) => subArray.length > 0).length,
    );
  }

  /** True when the link carries no fragment ("#section") part. */
  private noSections(link: string): boolean {
    return !link.includes("#");
  }

  /** True when the link's hostname equals the base hostname, ignoring a leading "www.". */
  private isInternalLink(link: string): boolean {
    const urlObj = new URL(link, this.baseUrl);
    const baseDomain = new URL(this.baseUrl).hostname
      .replace(/^www\./, "")
      .trim();
    const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();

    return linkDomain === baseDomain;
  }

  /**
   * True when the link is a subdomain of the base URL's registrable domain.
   * NOTE(review): uses the last two hostname labels, which is wrong for
   * multi-label public suffixes like .co.uk — confirm whether that matters here.
   */
  private isSubdomain(link: string): boolean {
    return new URL(link, this.baseUrl).hostname.endsWith(
      "." + new URL(this.baseUrl).hostname.split(".").slice(-2).join("."),
    );
  }

  /** True when the URL (query string ignored) ends in a known non-HTML file extension. */
  public isFile(url: string): boolean {
    const fileExtensions = [
      ".png",
      ".jpg",
      ".jpeg",
      ".gif",
      ".css",
      ".js",
      ".ico",
      ".svg",
      ".tiff",
      // ".pdf",
      ".zip",
      ".exe",
      ".dmg",
      ".mp4",
      ".mp3",
      ".wav",
      ".pptx",
      // ".docx",
      ".xlsx",
      // ".xml",
      ".avi",
      ".flv",
      ".woff",
      ".ttf",
      ".woff2",
      ".webp",
      ".inc",
    ];

    try {
      const urlWithoutQuery = url.split("?")[0].toLowerCase();
      return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
    } catch (error) {
      this.logger.error(`Error processing URL in isFile`, {
        method: "isFile",
        error,
      });
      return false;
    }
  }

  /** Substring match against known social-media hosts and mailto: links. */
  private isSocialMediaOrEmail(url: string): boolean {
    const socialMediaOrEmail = [
      "facebook.com",
      "twitter.com",
      "linkedin.com",
      "instagram.com",
      "pinterest.com",
      "mailto:",
      "github.com",
      "calendly.com",
      "discord.gg",
      "discord.com",
    ];
    return socialMediaOrEmail.some((ext) => url.includes(ext));
  }

  /**
   * Fetches sitemap links for `url`, trying in order: the URL's own sitemap
   * (or the URL itself when it already ends in .xml), the main domain's
   * sitemap when `url` is a subdomain, and finally the baseUrl's sitemap.
   * TimeoutSignal aborts propagate; all other errors are logged and swallowed.
   *
   * @returns Total number of links discovered across all attempted sitemaps.
   */
  private async tryFetchSitemapLinks(
    url: string,
    urlsHandler: (urls: string[]) => unknown,
    abort?: AbortSignal,
    mock?: string,
  ): Promise<number> {
    const sitemapUrl = url.endsWith(".xml")
      ? url
      : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;

    let sitemapCount: number = 0;

    // Try to get sitemap from the provided URL first
    try {
      sitemapCount = await getLinksFromSitemap(
        { sitemapUrl, urlsHandler, mode: "fire-engine" },
        this.logger,
        this.jobId,
        this.sitemapsHit,
        abort,
        mock,
      );
    } catch (error) {
      if (error instanceof TimeoutSignal) {
        throw error;
      } else {
        this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
          method: "tryFetchSitemapLinks",
          sitemapUrl,
          error,
        });
      }
    }

    // If this is a subdomain, also try to get sitemap from the main domain
    try {
      const urlObj = new URL(url);
      const hostname = urlObj.hostname;
      const domainParts = hostname.split(".");

      // Check if this is a subdomain (has more than 2 parts and not www)
      if (domainParts.length > 2 && domainParts[0] !== "www") {
        // Get the main domain by taking the last two parts
        const mainDomain = domainParts.slice(-2).join(".");
        const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
        const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;

        try {
          // Get all links from the main domain's sitemap
          sitemapCount += await getLinksFromSitemap(
            {
              sitemapUrl: mainDomainSitemapUrl,
              urlsHandler(urls) {
                // Keep only links that belong to the original (sub)domain.
                return urlsHandler(
                  urls.filter((link) => {
                    try {
                      const linkUrl = new URL(link);
                      return linkUrl.hostname.endsWith(hostname);
                    } catch {
                      // Unparseable link: drop it (previously an implicit
                      // undefined return — same falsy outcome, now explicit).
                      return false;
                    }
                  }),
                );
              },
              mode: "fire-engine",
            },
            this.logger,
            this.jobId,
            this.sitemapsHit,
            abort,
            mock,
          );
        } catch (error) {
          if (error instanceof TimeoutSignal) {
            throw error;
          } else {
            this.logger.debug(
              `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
              { method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
            );
          }
        }
      }
    } catch (error) {
      if (error instanceof TimeoutSignal) {
        throw error;
      } else {
        this.logger.debug(`Error processing main domain sitemap`, {
          method: "tryFetchSitemapLinks",
          url,
          error,
        });
      }
    }

    // If no sitemap found yet, try the baseUrl as a last resort
    if (sitemapCount === 0) {
      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
      try {
        sitemapCount += await getLinksFromSitemap(
          { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
          this.logger,
          this.jobId,
          this.sitemapsHit,
          abort,
          mock,
        );
      } catch (error) {
        if (error instanceof TimeoutSignal) {
          throw error;
        } else {
          this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
            method: "tryFetchSitemapLinks",
            sitemapUrl: baseUrlSitemap,
            error,
          });
          if (error instanceof AxiosError && error.response?.status === 404) {
            // ignore 404
          } else {
            // Non-404 failure: retry the baseUrl sitemap exactly once.
            sitemapCount += await getLinksFromSitemap(
              { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
              this.logger,
              this.jobId,
              this.sitemapsHit,
              abort,
              mock,
            );
          }
        }
      }
    }

    if (this.sitemapsHit.size >= 20) {
      this.logger.warn("Sitemap limit hit!", { crawlId: this.jobId, url: this.baseUrl });
    }

    return sitemapCount;
  }
}