mapleeit committed on
Commit
f825045
·
1 Parent(s): db43264

fix(adaptive-crawler): fix cache problem

Browse files
backend/functions/src/cloud-functions/adaptive-crawler.ts CHANGED
@@ -1,6 +1,8 @@
1
  import {
 
2
  assignTransferProtocolMeta,
3
  HashManager,
 
4
  RPCHost, RPCReflection,
5
  } from 'civkit';
6
  import { singleton } from 'tsyringe';
@@ -33,6 +35,8 @@ const removeURLHash = (url: string) => {
33
  @singleton()
34
  export class AdaptiveCrawlerHost extends RPCHost {
35
  logger = this.globalLogger.child({ service: this.constructor.name });
 
 
36
 
37
  static readonly __singleCrawlQueueName = 'singleCrawlQueue';
38
 
@@ -105,8 +109,13 @@ export class AdaptiveCrawlerHost extends RPCHost {
105
  const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
106
  const existing = await AdaptiveCrawlTask.fromFirestore(shortDigest);
107
 
108
- if (existing) {
109
- return { taskId: shortDigest };
 
 
 
 
 
110
  }
111
 
112
  await AdaptiveCrawlTask.COLLECTION.doc(shortDigest).set({
@@ -182,11 +191,19 @@ export class AdaptiveCrawlerHost extends RPCHost {
182
  @Param('urls') urls: string[] = [],
183
  ) {
184
  if (!taskId) {
185
- throw new Error('taskId is required');
186
  }
187
 
188
  const state = await AdaptiveCrawlTask.fromFirestore(taskId);
189
 
 
 
 
 
 
 
 
 
190
  if (urls.length) {
191
  const promises = Object.entries(state?.processed ?? {}).map(async ([url, cachePath]) => {
192
  if (urls.includes(url)) {
 
1
  import {
2
+ AssertionFailureError,
3
  assignTransferProtocolMeta,
4
  HashManager,
5
+ ParamValidationError,
6
  RPCHost, RPCReflection,
7
  } from 'civkit';
8
  import { singleton } from 'tsyringe';
 
35
  @singleton()
36
  export class AdaptiveCrawlerHost extends RPCHost {
37
  logger = this.globalLogger.child({ service: this.constructor.name });
38
+ // The underlying cache storage (GCP buckets) only keeps data for 7 days, so the expiry chosen here must be shorter than 7 days.
39
+ cacheExpiry = 3 * 1000 * 60 * 60 * 24;
40
 
41
  static readonly __singleCrawlQueueName = 'singleCrawlQueue';
42
 
 
109
  const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
110
  const existing = await AdaptiveCrawlTask.fromFirestore(shortDigest);
111
 
112
+ if (existing?.createdAt) {
113
+ if (existing.createdAt.getTime() > Date.now() - this.cacheExpiry) {
114
+ this.logger.info(`Cache hit for ${shortDigest}, created at ${existing.createdAt.toDateString()}`);
115
+ return { taskId: shortDigest };
116
+ } else {
117
+ this.logger.info(`Cache expired for ${shortDigest}, created at ${existing.createdAt.toDateString()}`);
118
+ }
119
  }
120
 
121
  await AdaptiveCrawlTask.COLLECTION.doc(shortDigest).set({
 
191
  @Param('urls') urls: string[] = [],
192
  ) {
193
  if (!taskId) {
194
+ throw new ParamValidationError('taskId is required');
195
  }
196
 
197
  const state = await AdaptiveCrawlTask.fromFirestore(taskId);
198
 
199
+ if (!state) {
200
+ throw new AssertionFailureError('The task does not exist');
201
+ }
202
+
203
+ if (state?.createdAt && state.createdAt.getTime() < Date.now() - this.cacheExpiry) {
204
+ throw new AssertionFailureError('The task has expired');
205
+ }
206
+
207
  if (urls.length) {
208
  const promises = Object.entries(state?.processed ?? {}).map(async ([url, cachePath]) => {
209
  if (urls.includes(url)) {