Spaces:
Build error
Build error
chore: don't abuse our service
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -571,10 +571,15 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 571 |
if (blockade) {
|
| 572 |
throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
| 573 |
}
|
| 574 |
-
|
|
|
|
|
|
|
|
|
|
| 575 |
|
|
|
|
| 576 |
const crawlOpts = this.configure(crawlerOptions);
|
| 577 |
|
|
|
|
| 578 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
| 579 |
const sseStream = new OutputServerEventStream();
|
| 580 |
rpcReflect.return(sseStream);
|
|
@@ -767,7 +772,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 767 |
return r;
|
| 768 |
}
|
| 769 |
|
| 770 |
-
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
|
| 771 |
let cache;
|
| 772 |
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
| 773 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
|
@@ -821,7 +826,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 821 |
}
|
| 822 |
|
| 823 |
|
| 824 |
-
async *scrapMany(urls: URL[], options?: ScrappingOptions, cacheTolerance?: number) {
|
| 825 |
const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
|
| 826 |
|
| 827 |
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
|
|
|
|
| 571 |
if (blockade) {
|
| 572 |
throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
| 573 |
}
|
| 574 |
+
if (urlToCrawl.protocol === 'http:' && (!urlToCrawl.pathname || urlToCrawl.pathname === '/') &&
|
| 575 |
+
crawlerOptions.respondWith === 'html') {
|
| 576 |
+
throw new SecurityCompromiseError(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
|
| 577 |
+
}
|
| 578 |
|
| 579 |
+
}
|
| 580 |
const crawlOpts = this.configure(crawlerOptions);
|
| 581 |
|
| 582 |
+
|
| 583 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
| 584 |
const sseStream = new OutputServerEventStream();
|
| 585 |
rpcReflect.return(sseStream);
|
|
|
|
| 772 |
return r;
|
| 773 |
}
|
| 774 |
|
| 775 |
+
async * cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
|
| 776 |
let cache;
|
| 777 |
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
| 778 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
|
|
|
| 826 |
}
|
| 827 |
|
| 828 |
|
| 829 |
+
async * scrapMany(urls: URL[], options?: ScrappingOptions, cacheTolerance?: number) {
|
| 830 |
const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
|
| 831 |
|
| 832 |
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
|