Spaces:
Build error
Build error
fix(adaptive-crawler): fix cache problem
Browse files
backend/functions/src/cloud-functions/adaptive-crawler.ts
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import {
|
|
|
|
| 2 |
assignTransferProtocolMeta,
|
| 3 |
HashManager,
|
|
|
|
| 4 |
RPCHost, RPCReflection,
|
| 5 |
} from 'civkit';
|
| 6 |
import { singleton } from 'tsyringe';
|
|
@@ -33,6 +35,8 @@ const removeURLHash = (url: string) => {
|
|
| 33 |
@singleton()
|
| 34 |
export class AdaptiveCrawlerHost extends RPCHost {
|
| 35 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
|
|
|
|
|
|
| 36 |
|
| 37 |
static readonly __singleCrawlQueueName = 'singleCrawlQueue';
|
| 38 |
|
|
@@ -105,8 +109,13 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
|
| 105 |
const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
|
| 106 |
const existing = await AdaptiveCrawlTask.fromFirestore(shortDigest);
|
| 107 |
|
| 108 |
-
if (existing) {
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
}
|
| 111 |
|
| 112 |
await AdaptiveCrawlTask.COLLECTION.doc(shortDigest).set({
|
|
@@ -182,11 +191,19 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
|
| 182 |
@Param('urls') urls: string[] = [],
|
| 183 |
) {
|
| 184 |
if (!taskId) {
|
| 185 |
-
throw new
|
| 186 |
}
|
| 187 |
|
| 188 |
const state = await AdaptiveCrawlTask.fromFirestore(taskId);
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
if (urls.length) {
|
| 191 |
const promises = Object.entries(state?.processed ?? {}).map(async ([url, cachePath]) => {
|
| 192 |
if (urls.includes(url)) {
|
|
|
|
| 1 |
import {
|
| 2 |
+
AssertionFailureError,
|
| 3 |
assignTransferProtocolMeta,
|
| 4 |
HashManager,
|
| 5 |
+
ParamValidationError,
|
| 6 |
RPCHost, RPCReflection,
|
| 7 |
} from 'civkit';
|
| 8 |
import { singleton } from 'tsyringe';
|
|
|
|
| 35 |
@singleton()
|
| 36 |
export class AdaptiveCrawlerHost extends RPCHost {
|
| 37 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 38 |
+
// Cached data in the actual storage (GCP buckets) only exists for 7 days, so the expiry chosen here must be shorter than 7 days.
|
| 39 |
+
cacheExpiry = 3 * 1000 * 60 * 60 * 24;
|
| 40 |
|
| 41 |
static readonly __singleCrawlQueueName = 'singleCrawlQueue';
|
| 42 |
|
|
|
|
| 109 |
const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
|
| 110 |
const existing = await AdaptiveCrawlTask.fromFirestore(shortDigest);
|
| 111 |
|
| 112 |
+
if (existing?.createdAt) {
|
| 113 |
+
if (existing.createdAt.getTime() > Date.now() - this.cacheExpiry) {
|
| 114 |
+
this.logger.info(`Cache hit for ${shortDigest}, created at ${existing.createdAt.toDateString()}`);
|
| 115 |
+
return { taskId: shortDigest };
|
| 116 |
+
} else {
|
| 117 |
+
this.logger.info(`Cache expired for ${shortDigest}, created at ${existing.createdAt.toDateString()}`);
|
| 118 |
+
}
|
| 119 |
}
|
| 120 |
|
| 121 |
await AdaptiveCrawlTask.COLLECTION.doc(shortDigest).set({
|
|
|
|
| 191 |
@Param('urls') urls: string[] = [],
|
| 192 |
) {
|
| 193 |
if (!taskId) {
|
| 194 |
+
throw new ParamValidationError('taskId is required');
|
| 195 |
}
|
| 196 |
|
| 197 |
const state = await AdaptiveCrawlTask.fromFirestore(taskId);
|
| 198 |
|
| 199 |
+
if (!state) {
|
| 200 |
+
throw new AssertionFailureError('The task does not exist');
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
if (state?.createdAt && state.createdAt.getTime() < Date.now() - this.cacheExpiry) {
|
| 204 |
+
throw new AssertionFailureError('The task has expired');
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
if (urls.length) {
|
| 208 |
const promises = Object.entries(state?.processed ?? {}).map(async ([url, cachePath]) => {
|
| 209 |
if (urls.includes(url)) {
|