import axios, { AxiosError } from "axios";
import { load } from "cheerio"; // rustified
import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import robotsParser, { Robot } from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../lib/timeout";
import { logger as _logger } from "../../lib/logger";
import https from "https";
import { redisConnection } from "../../services/queue-service";
import { extractLinks } from "../../lib/html-transformer";
import { TimeoutSignal } from "../../controllers/v1/types";
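
/**
 * Crawls a site starting from `initialUrl`, discovering links through
 * sitemaps and HTML while honoring robots.txt (unless `ignoreRobotsTxt`),
 * include/exclude regexes, depth limits, and a global page limit.
 *
 * Illustrative usage (option values are arbitrary examples):
 *
 *   const crawler = new WebCrawler({
 *     jobId: "job-1",
 *     initialUrl: "https://example.com",
 *     limit: 100,
 *   });
 *   crawler.importRobotsTxt(await crawler.getRobotsTxt());
 *   await crawler.tryGetSitemap(async (urls) => { console.log(urls); });
 */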
export class WebCrawler {
  private jobId: string;
  private initialUrl: string;
  private baseUrl: string;
  private includes: string[];
  private excludes: string[];
  private maxCrawledLinks: number;
  private maxCrawledDepth: number;
  private visited: Set<string> = new Set();
  private crawledUrls: Map<string, string> = new Map();
  private limit: number;
  private robotsTxtUrl: string;
  public robots: Robot;
  private generateImgAltText: boolean;
  private allowBackwardCrawling: boolean;
  private allowExternalContentLinks: boolean;
  private allowSubdomains: boolean;
  private ignoreRobotsTxt: boolean;
  private regexOnFullURL: boolean;
  private logger: typeof _logger;
  private sitemapsHit: Set<string> = new Set();
  private maxDiscoveryDepth: number | undefined;
  private currentDiscoveryDepth: number;

  constructor({
    jobId,
    initialUrl,
    baseUrl,
    includes,
    excludes,
    maxCrawledLinks = 10000,
    limit = 10000,
    generateImgAltText = false,
    maxCrawledDepth = 10,
    allowBackwardCrawling = false,
    allowExternalContentLinks = false,
    allowSubdomains = false,
    ignoreRobotsTxt = false,
    regexOnFullURL = false,
    maxDiscoveryDepth,
    currentDiscoveryDepth,
  }: {
    jobId: string;
    initialUrl: string;
    baseUrl?: string;
    includes?: string[];
    excludes?: string[];
    maxCrawledLinks?: number;
    limit?: number;
    generateImgAltText?: boolean;
    maxCrawledDepth?: number;
    allowBackwardCrawling?: boolean;
    allowExternalContentLinks?: boolean;
    allowSubdomains?: boolean;
    ignoreRobotsTxt?: boolean;
    regexOnFullURL?: boolean;
    maxDiscoveryDepth?: number;
    currentDiscoveryDepth?: number;
  }) {
    this.jobId = jobId;
    this.initialUrl = initialUrl;
    this.baseUrl = baseUrl ?? new URL(initialUrl).origin;
    this.includes = Array.isArray(includes) ? includes : [];
    this.excludes = Array.isArray(excludes) ? excludes : [];
    this.limit = limit;
    this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
    this.robots = robotsParser(this.robotsTxtUrl, "");
    // Deprecated, use limit instead
    this.maxCrawledLinks = maxCrawledLinks ?? limit;
    this.maxCrawledDepth = maxCrawledDepth ?? 10;
    this.generateImgAltText = generateImgAltText ?? false;
    this.allowBackwardCrawling = allowBackwardCrawling ?? false;
    this.allowExternalContentLinks = allowExternalContentLinks ?? false;
    this.allowSubdomains = allowSubdomains ?? false;
    this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
    this.regexOnFullURL = regexOnFullURL ?? false;
    this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
    this.maxDiscoveryDepth = maxDiscoveryDepth;
    this.currentDiscoveryDepth = currentDiscoveryDepth ?? 0;
  }
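
  /**
   * Filters candidate links against discovery depth, URL depth,
   * include/exclude regexes, backward-crawling rules, robots.txt,
   * and file-extension checks, returning at most `limit` links.
   */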
  public filterLinks(
    sitemapLinks: string[],
    limit: number,
    maxDepth: number,
    fromMap: boolean = false,
  ): string[] {
    if (this.currentDiscoveryDepth === this.maxDiscoveryDepth) {
      this.logger.debug("Max discovery depth hit, filtering off all links", {
        currentDiscoveryDepth: this.currentDiscoveryDepth,
        maxDiscoveryDepth: this.maxDiscoveryDepth,
      });
      return [];
    }

    // If the initial URL is a sitemap.xml, skip filtering
    if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
      return sitemapLinks.slice(0, limit);
    }

    return sitemapLinks
      .filter((link) => {
        let url: URL;
        try {
          url = new URL(link.trim(), this.baseUrl);
        } catch (error) {
          this.logger.debug(`Error processing link: ${link}`, {
            link,
            error,
            method: "filterLinks",
          });
          return false;
        }
        const path = url.pathname;

        const depth = getURLDepth(url.toString());

        // Check if the link exceeds the maximum depth allowed
        if (depth > maxDepth) {
          if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
            this.logger.debug(`${link} DEPTH FAIL`);
          }
          return false;
        }
        // Match include/exclude patterns against the full URL when
        // regexOnFullURL is set; otherwise only against the pathname.
        const excIncPath = this.regexOnFullURL ? link : path;

        // Check if the link should be excluded
        if (this.excludes.length > 0 && this.excludes[0] !== "") {
          if (
            this.excludes.some((excludePattern) =>
              new RegExp(excludePattern).test(excIncPath),
            )
          ) {
            if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
              this.logger.debug(`${link} EXCLUDE FAIL`);
            }
            return false;
          }
        }

        // Check if the link matches the include patterns, if any are specified
        if (this.includes.length > 0 && this.includes[0] !== "") {
          if (
            !this.includes.some((includePattern) =>
              new RegExp(includePattern).test(excIncPath),
            )
          ) {
            if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
              this.logger.debug(`${link} INCLUDE FAIL`);
            }
            return false;
          }
        }
        // Normalize the initial URL and the link to account for www and non-www versions
        const normalizedInitialUrl = new URL(this.initialUrl);
        let normalizedLink;
        try {
          normalizedLink = new URL(link);
        } catch (_) {
          if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
            this.logger.debug(`${link} URL PARSE FAIL`);
          }
          return false;
        }
        const initialHostname = normalizedInitialUrl.hostname.replace(
          /^www\./,
          "",
        );
        const linkHostname = normalizedLink.hostname.replace(/^www\./, "");
        // Ensure the protocol and hostname match, and the path starts with the initial URL's path
        // Commented out to allow handling of external links via allowExternalContentLinks
        // if (linkHostname !== initialHostname) {
        //   return false;
        // }
        if (!this.allowBackwardCrawling) {
          if (
            !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
          ) {
            if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
              this.logger.debug(
                `${link} BACKWARDS FAIL ${normalizedLink.pathname} ${normalizedInitialUrl.pathname}`,
              );
            }
            return false;
          }
        }

        const isAllowed = this.ignoreRobotsTxt
          ? true
          : ((this.robots.isAllowed(link, "FireCrawlAgent") ||
              this.robots.isAllowed(link, "FirecrawlAgent")) ??
            true);
        // Check if the link is disallowed by robots.txt
        if (!isAllowed) {
          this.logger.debug(`Link disallowed by robots.txt: ${link}`, {
            method: "filterLinks",
            link,
          });
          if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
            this.logger.debug(`${link} ROBOTS FAIL`);
          }
          return false;
        }

        if (this.isFile(link)) {
          if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
            this.logger.debug(`${link} FILE FAIL`);
          }
          return false;
        }

        if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
          this.logger.debug(`${link} OK`);
        }
        return true;
      })
      .slice(0, limit);
  }
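
  /**
   * Fetches the raw robots.txt for the crawl's base URL. When
   * `skipTlsVerification` is set, TLS certificate errors are ignored.
   */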
  public async getRobotsTxt(
    skipTlsVerification = false,
    abort?: AbortSignal,
  ): Promise<string> {
    const extraArgs: { httpsAgent?: https.Agent } = {};
    if (skipTlsVerification) {
      extraArgs.httpsAgent = new https.Agent({
        rejectUnauthorized: false,
      });
    }
    const response = await axios.get(this.robotsTxtUrl, {
      timeout: axiosTimeout,
      signal: abort,
      ...extraArgs,
    });
    return response.data;
  }
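
  /** Replaces the current robots.txt rules with the given file contents. */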
  public importRobotsTxt(txt: string) {
    this.robots = robotsParser(this.robotsTxtUrl, txt);
  }
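
  /**
   * Attempts to discover URLs via sitemaps: the initial URL's sitemap.xml,
   * any sitemaps declared in robots.txt, and (as fallbacks) the main domain
   * and base URL. Discovered URLs are deduplicated in Redis and passed to
   * `urlsHandler` in batches. Returns the number of URLs found, or 0 on
   * timeout or error.
   */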
  public async tryGetSitemap(
    urlsHandler: (urls: string[]) => unknown,
    fromMap: boolean = false,
    onlySitemap: boolean = false,
    timeout: number = 120000,
    abort?: AbortSignal,
    mock?: string,
  ): Promise<number> {
    this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
      method: "tryGetSitemap",
    });

    let leftOfLimit = this.limit;

    const normalizeUrl = (url: string) => {
      url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
      if (url.endsWith("/")) {
        url = url.slice(0, -1);
      }
      return url;
    };

    const _urlsHandler = async (urls: string[]) => {
      if (fromMap && onlySitemap) {
        return urlsHandler(urls);
      } else {
        let filteredLinks = this.filterLinks(
          [...new Set(urls)].filter(
            (x) => this.filterURL(x, this.initialUrl) !== null,
          ),
          leftOfLimit,
          this.maxCrawledDepth,
          fromMap,
        );
        leftOfLimit -= filteredLinks.length;
        let uniqueURLs: string[] = [];
        for (const url of filteredLinks) {
          if (
            await redisConnection.sadd(
              "sitemap:" + this.jobId + ":links",
              normalizeUrl(url),
            )
          ) {
            uniqueURLs.push(url);
          }
        }

        await redisConnection.expire(
          "sitemap:" + this.jobId + ":links",
          3600,
          "NX",
        );
        if (uniqueURLs.length > 0) {
          return urlsHandler(uniqueURLs);
        }
      }
    };
    // Reject after `timeout` ms; the timer is cleared once the race settles
    // so it does not keep the event loop alive.
    let timeoutHandle: NodeJS.Timeout | undefined;
    const timeoutPromise = new Promise<never>((_, reject) => {
      timeoutHandle = setTimeout(
        () => reject(new Error("Sitemap fetch timeout")),
        timeout,
      );
    });

    try {
      let count = (await Promise.race([
        Promise.all([
          this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort, mock),
          ...this.robots
            .getSitemaps()
            .map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort, mock)),
        ]).then((results) => results.reduce((a, x) => a + x, 0)),
        timeoutPromise,
      ])) as number;
      if (count > 0) {
        if (
          await redisConnection.sadd(
            "sitemap:" + this.jobId + ":links",
            normalizeUrl(this.initialUrl),
          )
        ) {
          urlsHandler([this.initialUrl]);
        }
        count++;
      }
      return count;
    } catch (error) {
      if (error instanceof Error && error.message === "Sitemap fetch timeout") {
        this.logger.warn("Sitemap fetch timed out", {
          method: "tryGetSitemap",
          timeout,
        });
        return 0;
      }
      this.logger.error("Error fetching sitemap", {
        method: "tryGetSitemap",
        error,
      });
      return 0;
    } finally {
      clearTimeout(timeoutHandle);
    }
  }
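
  /**
   * Resolves `href` against `url` and decides whether it should be crawled.
   * Returns the absolute URL if accepted, or null if rejected (external
   * link, excluded pattern, fragment link, robots.txt-blocked, etc.).
   */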
  public filterURL(href: string, url: string): string | null {
    let fullUrl = href;
    if (!href.startsWith("http")) {
      try {
        fullUrl = new URL(href, url).toString();
      } catch (_) {
        return null;
      }
    }
    let urlObj;
    try {
      urlObj = new URL(fullUrl);
    } catch (_) {
      return null;
    }
    const path = urlObj.pathname;

    if (this.isInternalLink(fullUrl)) {
      // INTERNAL LINKS
      if (this.noSections(fullUrl) && !this.matchesExcludes(path)) {
        if (this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)) {
          return fullUrl;
        }
        // Record robots.txt-blocked URLs for reporting (fire-and-forget;
        // the key expires after 24 hours).
        (async () => {
          await redisConnection.sadd(
            "crawl:" + this.jobId + ":robots_blocked",
            fullUrl,
          );
          await redisConnection.expire(
            "crawl:" + this.jobId + ":robots_blocked",
            24 * 60 * 60,
          );
        })();
      }
    } else {
      // EXTERNAL LINKS
      if (
        this.isInternalLink(url) &&
        this.allowExternalContentLinks &&
        !this.isSocialMediaOrEmail(fullUrl) &&
        !this.matchesExcludes(fullUrl, true) &&
        !this.isExternalMainPage(fullUrl)
      ) {
        return fullUrl;
      }
    }

    if (
      this.allowSubdomains &&
      !this.isSocialMediaOrEmail(fullUrl) &&
      this.isSubdomain(fullUrl)
    ) {
      return fullUrl;
    }

    return null;
  }
  private async extractLinksFromHTMLRust(html: string, url: string) {
    return (await extractLinks(html)).filter(
      (x) => this.filterURL(x, url) !== null,
    );
  }

  private extractLinksFromHTMLCheerio(html: string, url: string) {
    let links: string[] = [];

    const $ = load(html);
    $("a").each((_, element) => {
      let href = $(element).attr("href");
      if (href) {
        // Repair malformed protocols like "https:/example.com" by
        // inserting the missing slash.
        if (href.match(/^https?:\/[^\/]/)) {
          href = href.replace(/^https?:\//, "$&/");
        }
        const u = this.filterURL(href, url);
        if (u !== null) {
          links.push(u);
        }
      }
    });

    // Extract links from iframes with inline src
    $("iframe").each((_, element) => {
      const src = $(element).attr("src");
      if (src && src.startsWith("data:text/html")) {
        const iframeHtml = decodeURIComponent(src.split(",")[1]);
        const iframeLinks = this.extractLinksFromHTMLCheerio(iframeHtml, url);
        links = links.concat(iframeLinks);
      }
    });

    return links;
  }
| } | |
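
  /**
   * Extracts and filters links from an HTML document, preferring the Rust
   * html-transformer and falling back to cheerio if it is unavailable.
   */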
  public async extractLinksFromHTML(html: string, url: string) {
    try {
      return [
        ...new Set(
          (await this.extractLinksFromHTMLRust(html, url))
            .map((x) => {
              try {
                return new URL(x, url).href;
              } catch (e) {
                return null;
              }
            })
            .filter((x) => x !== null) as string[],
        ),
      ];
    } catch (error) {
      this.logger.warn(
        "Failed to call html-transformer! Falling back to cheerio...",
        {
          error,
          module: "scrapeURL",
          method: "extractLinksFromHTML",
        },
      );
    }

    return this.extractLinksFromHTMLCheerio(html, url);
  }
  private isRobotsAllowed(
    url: string,
    ignoreRobotsTxt: boolean = false,
  ): boolean {
    return ignoreRobotsTxt
      ? true
      : this.robots
        ? ((this.robots.isAllowed(url, "FireCrawlAgent") ||
            this.robots.isAllowed(url, "FirecrawlAgent")) ??
          true)
        : true;
  }

  private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
    if (onlyDomains) return this.matchesExcludesExternalDomains(url);
    return this.excludes.some((pattern) => new RegExp(pattern).test(url));
  }
  // Supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
  private matchesExcludesExternalDomains(url: string) {
    try {
      const urlObj = new URL(url);
      const hostname = urlObj.hostname;
      const pathname = urlObj.pathname;

      for (let domain of this.excludes) {
        let domainObj = new URL("http://" + domain.replace(/^https?:\/\//, ""));
        let domainHostname = domainObj.hostname;
        let domainPathname = domainObj.pathname;
        if (
          hostname === domainHostname ||
          hostname.endsWith(`.${domainHostname}`)
        ) {
          if (pathname.startsWith(domainPathname)) {
            return true;
          }
        }
      }
      return false;
    } catch (e) {
      return false;
    }
  }
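
  // Treats a URL as a "main page" when it has no non-empty path segments.
  // Splitting "https://example.com/a/b" on "/" yields
  // ["https:", "", "example.com", "a", "b"], so slice(3) is the path.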
  private isExternalMainPage(url: string): boolean {
    return !Boolean(
      url
        .split("/")
        .slice(3)
        .filter((subArray) => subArray.length > 0).length,
    );
  }

  private noSections(link: string): boolean {
    return !link.includes("#");
  }

  private isInternalLink(link: string): boolean {
    const urlObj = new URL(link, this.baseUrl);
    const baseDomain = new URL(this.baseUrl).hostname
      .replace(/^www\./, "")
      .trim();
    const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();

    return linkDomain === baseDomain;
  }
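
  // Note: this uses a last-two-labels heuristic for the registrable domain,
  // so it misjudges multi-part TLDs such as "example.co.uk".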
  private isSubdomain(link: string): boolean {
    return new URL(link, this.baseUrl).hostname.endsWith(
      "." + new URL(this.baseUrl).hostname.split(".").slice(-2).join("."),
    );
  }
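
  /**
   * Returns true if the URL points to a static asset that should not be
   * crawled. The query string is stripped before checking the extension.
   */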
  public isFile(url: string): boolean {
    const fileExtensions = [
      ".png",
      ".jpg",
      ".jpeg",
      ".gif",
      ".css",
      ".js",
      ".ico",
      ".svg",
      ".tiff",
      // ".pdf",
      ".zip",
      ".exe",
      ".dmg",
      ".mp4",
      ".mp3",
      ".wav",
      ".pptx",
      // ".docx",
      ".xlsx",
      // ".xml",
      ".avi",
      ".flv",
      ".woff",
      ".ttf",
      ".woff2",
      ".webp",
      ".inc",
    ];

    try {
      const urlWithoutQuery = url.split("?")[0].toLowerCase();
      return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
    } catch (error) {
      this.logger.error(`Error processing URL in isFile`, {
        method: "isFile",
        error,
      });
      return false;
    }
  }
  private isSocialMediaOrEmail(url: string): boolean {
    const socialMediaOrEmail = [
      "facebook.com",
      "twitter.com",
      "linkedin.com",
      "instagram.com",
      "pinterest.com",
      "mailto:",
      "github.com",
      "calendly.com",
      "discord.gg",
      "discord.com",
    ];
    return socialMediaOrEmail.some((ext) => url.includes(ext));
  }
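
  /**
   * Fetches sitemap links for `url`, trying in order: the URL's own
   * sitemap.xml, the main domain's sitemap when `url` is a subdomain
   * (keeping only links that belong to the subdomain), and finally the
   * base URL's sitemap. Returns the total number of links handled.
   */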
  private async tryFetchSitemapLinks(
    url: string,
    urlsHandler: (urls: string[]) => unknown,
    abort?: AbortSignal,
    mock?: string,
  ): Promise<number> {
    const sitemapUrl = url.endsWith(".xml")
      ? url
      : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;

    let sitemapCount: number = 0;

    // Try to get sitemap from the provided URL first
    try {
      sitemapCount = await getLinksFromSitemap(
        { sitemapUrl, urlsHandler, mode: "fire-engine" },
        this.logger,
        this.jobId,
        this.sitemapsHit,
        abort,
        mock,
      );
    } catch (error) {
      if (error instanceof TimeoutSignal) {
        throw error;
      } else {
        this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
          method: "tryFetchSitemapLinks",
          sitemapUrl,
          error,
        });
      }
    }
    // If this is a subdomain, also try to get sitemap from the main domain
    try {
      const urlObj = new URL(url);
      const hostname = urlObj.hostname;
      const domainParts = hostname.split(".");

      // Check if this is a subdomain (has more than 2 parts and not www)
      if (domainParts.length > 2 && domainParts[0] !== "www") {
        // Get the main domain by taking the last two parts
        const mainDomain = domainParts.slice(-2).join(".");
        const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
        const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;

        try {
          // Get all links from the main domain's sitemap, keeping only
          // those that belong to the original subdomain
          sitemapCount += await getLinksFromSitemap(
            {
              sitemapUrl: mainDomainSitemapUrl,
              urlsHandler(urls) {
                return urlsHandler(
                  urls.filter((link) => {
                    try {
                      const linkUrl = new URL(link);
                      return linkUrl.hostname.endsWith(hostname);
                    } catch {
                      return false;
                    }
                  }),
                );
              },
              mode: "fire-engine",
            },
            this.logger,
            this.jobId,
            this.sitemapsHit,
            abort,
            mock,
          );
        } catch (error) {
          if (error instanceof TimeoutSignal) {
            throw error;
          } else {
            this.logger.debug(
              `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
              { method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
            );
          }
        }
      }
    } catch (error) {
      if (error instanceof TimeoutSignal) {
        throw error;
      } else {
        this.logger.debug(`Error processing main domain sitemap`, {
          method: "tryFetchSitemapLinks",
          url,
          error,
        });
      }
    }
    // If no sitemap found yet, try the baseUrl as a last resort
    if (sitemapCount === 0) {
      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
      try {
        sitemapCount += await getLinksFromSitemap(
          { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
          this.logger,
          this.jobId,
          this.sitemapsHit,
          abort,
          mock,
        );
      } catch (error) {
        if (error instanceof TimeoutSignal) {
          throw error;
        } else {
          this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
            method: "tryFetchSitemapLinks",
            sitemapUrl: baseUrlSitemap,
            error,
          });
          if (error instanceof AxiosError && error.response?.status === 404) {
            // ignore 404
          } else {
            // Retry once for errors other than 404; a second failure
            // propagates to the caller.
            sitemapCount += await getLinksFromSitemap(
              { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
              this.logger,
              this.jobId,
              this.sitemapsHit,
              abort,
              mock,
            );
          }
        }
      }
    }

    if (this.sitemapsHit.size >= 20) {
      this.logger.warn("Sitemap limit hit!", {
        crawlId: this.jobId,
        url: this.baseUrl,
      });
    }

    return sitemapCount;
  }
}