File size: 26,542 Bytes
45d1682
 
67d4a9f
23a3b80
6b9e14d
f7dbadf
33e14e5
53bc91c
 
 
 
 
 
 
53821d0
 
53bc91c
 
54abc17
53821d0
54abc17
f7dbadf
23a3b80
54abc17
 
f7dbadf
 
2a30fce
f7dbadf
 
 
 
 
 
53bc91c
165cce6
59dcc2d
 
e23d9f3
53821d0
59dcc2d
2606c44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165cce6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e5abd3
165cce6
 
 
 
d3f3a85
53bc91c
 
4e5abd3
 
 
53bc91c
165cce6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f37e5d
 
 
 
 
 
 
62fb6cf
 
 
 
 
 
 
165cce6
 
 
 
 
 
 
 
 
 
 
23a3b80
ce11f44
23a3b80
 
 
0da71ca
 
 
 
 
 
 
 
 
 
165cce6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59dcc2d
 
 
 
 
 
 
 
 
 
 
102a168
 
 
 
 
 
 
 
 
 
165cce6
 
 
 
 
a9936d3
 
 
 
 
de50c93
 
 
 
7e6c2fc
 
 
 
 
696536c
 
 
 
 
6c23342
f7dbadf
 
2a30fce
 
f7dbadf
2a30fce
 
 
f7dbadf
 
 
6c23342
4e5abd3
6c23342
 
 
53821d0
e23d9f3
53821d0
 
 
6b9e14d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5141814
6b9e14d
 
 
 
5141814
6b9e14d
 
 
165cce6
 
 
 
33e14e5
 
8739943
 
 
1c4b64f
 
 
53821d0
 
 
 
 
 
080056e
 
45d1682
080056e
45d1682
080056e
33e14e5
53bc91c
 
33e14e5
 
 
 
 
 
 
 
59dcc2d
 
 
33e14e5
 
 
53821d0
33e14e5
 
 
 
 
 
 
 
 
 
 
16cabca
 
 
 
 
33e14e5
 
 
6f37e5d
 
33e14e5
6f37e5d
 
 
 
 
33e14e5
62fb6cf
 
 
 
 
77c8480
 
22647a0
77c8480
22647a0
77c8480
102a168
 
 
 
 
33e14e5
 
 
67d4a9f
33e14e5
 
 
 
23a3b80
 
 
165cce6
 
 
cc6d2f3
6c23342
 
22647a0
 
 
 
 
 
 
 
 
 
a9936d3
 
61ff011
 
a9936d3
61ff011
a9936d3
de50c93
 
 
7e6c2fc
 
 
696536c
 
 
2606c44
 
 
06f3593
 
 
 
 
 
23a3b80
 
 
 
 
 
6b9e14d
 
 
f7dbadf
 
 
 
 
12ba1bc
 
33e14e5
 
23a3b80
33e14e5
23a3b80
 
33e14e5
 
53821d0
 
 
 
 
 
 
 
 
 
 
33e14e5
23a3b80
 
de50c93
 
 
23a3b80
 
7e6c2fc
 
 
23a3b80
 
33e14e5
 
23a3b80
 
53821d0
 
 
 
 
33e14e5
23a3b80
 
33e14e5
 
23a3b80
59dcc2d
 
 
 
 
 
23a3b80
 
33e14e5
 
9ac4060
 
 
23a3b80
33e14e5
 
 
 
23a3b80
16cabca
 
 
 
23a3b80
efe7a61
 
23a3b80
61ff011
a9936d3
 
23a3b80
 
 
 
 
 
 
165cce6
33e14e5
23a3b80
6c23342
 
 
53821d0
 
 
f7dbadf
53821d0
 
f7dbadf
6a58de5
53821d0
dc80020
6c23342
23a3b80
 
62fb6cf
 
23a3b80
 
22647a0
77c8480
 
 
 
23a3b80
102a168
 
 
 
 
 
62fb6cf
67d4a9f
23a3b80
33e14e5
 
 
67d4a9f
33e14e5
 
 
 
67d4a9f
33e14e5
 
c7860e6
33e14e5
23a3b80
 
 
 
 
 
33e14e5
23a3b80
696536c
 
23a3b80
e23d9f3
 
 
53821d0
23a3b80
 
 
f7dbadf
 
 
 
 
696536c
 
 
 
6b9e14d
 
 
 
9ac4060
 
22647a0
9415c6a
 
 
 
2a30fce
9415c6a
 
 
 
 
 
 
 
 
f7dbadf
 
22647a0
 
9415c6a
 
f7dbadf
 
 
 
 
 
 
 
 
 
 
9415c6a
f7dbadf
 
 
 
 
8121d62
22647a0
 
2a30fce
 
 
ff595c2
 
 
9415c6a
22647a0
 
9415c6a
8121d62
 
53821d0
51a4877
 
d2afa9d
 
 
22647a0
d2afa9d
22647a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2606c44
 
 
22647a0
 
 
53bc91c
 
 
 
54abc17
6a58de5
2a30fce
f7dbadf
 
c19ba65
54abc17
 
 
6a58de5
 
 
 
 
 
54abc17
 
 
 
 
 
 
 
 
 
 
 
 
 
9ac4060
 
 
 
 
 
 
 
33e14e5
 
3020d58
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit/civ-rpc';
import { FancyFile } from 'civkit/fancy-file';
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
import { Context } from '../services/registry';
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
import type { PageSnapshot } from '../services/puppeteer';

/**
 * Single (non-compound) output formats that `CrawlerOptions.respondWith`
 * can name. A `respondWith` string that is not exactly one of these values
 * is treated as a compound format (see `isRequestingCompoundContentFormat`).
 */
export enum CONTENT_FORMAT {
    CONTENT = 'content',
    MARKDOWN = 'markdown',
    HTML = 'html',
    TEXT = 'text',
    PAGESHOT = 'pageshot',
    SCREENSHOT = 'screenshot',
    VLM = 'vlm',
    READER_LM = 'readerlm-v2',
}

/**
 * Known engine identifiers. `CrawlerOptions.from` assigns members of this
 * enum to `CrawlerOptions.engine` when resolving the `vlm` / `readerlm-v2`
 * engine aliases.
 */
export enum ENGINE_TYPE {
    AUTO = 'auto',
    BROWSER = 'browser',
    CURL = 'curl',
    CF_BROWSER_RENDERING = 'cf-browser-rendering',
}

/**
 * Points in the page lifecycle at which a response may be returned.
 * Member descriptions follow the `X-Respond-Timing` header documentation.
 */
export enum RESPOND_TIMING {
    /** Directly return unrendered HTML. */
    HTML = 'html',
    /** Return as soon as any content becomes available. */
    VISIBLE_CONTENT = 'visible-content',
    /** Wait for DOM mutations to settle. */
    MUTATION_IDLE = 'mutation-idle',
    /** Wait until no further content-affecting resources are loading. */
    RESOURCE_IDLE = 'resource-idle',
    /** Like resource-idle, but also covering media resources. */
    MEDIA_IDLE = 'media-idle',
    /** Wait for full page load (networkidle0). */
    NETWORK_IDLE = 'network-idle',
}

// Lookup set of every single-format name, used to detect compound `respondWith` values.
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));

// Accepted values for `retainImages` / `X-Retain-Images`.
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
// Set form of IMAGE_RETENTION_MODES, used as the @Prop type and for header validation.
const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
// Accepted values for `base` / `X-Base`.
export const BASE_URL_MODES = ['initial', 'final'] as const;
// Set form of BASE_URL_MODES, used as the @Prop type for `base`.
const BASE_URL_MODE_VALUES = new Set<string>(BASE_URL_MODES);

/**
 * Viewport settings accepted via `CrawlerOptions.viewport`.
 *
 * NOTE(review): field names match Puppeteer's `Viewport` shape; presumably
 * this object is passed through to the browser — confirm against the caller.
 */
class Viewport extends AutoCastable {
    // Defaults to 1024 when not supplied.
    @Prop({
        default: 1024
    })
    width!: number;
    // Defaults to 1024 when not supplied.
    @Prop({
        default: 1024
    })
    height!: number;
    @Prop()
    deviceScaleFactor?: number;
    @Prop()
    isMobile?: boolean;
    @Prop()
    isLandscape?: boolean;
    @Prop()
    hasTouch?: boolean;
}

@Also({
    openapi: {
        operation: {
            parameters: {
                'Accept': {
                    description: `Specifies your preference for the response format.\n\n` +
                        `Supported formats: \n` +
                        `- text/event-stream\n` +
                        `- application/json or text/json\n` +
                        `- text/plain`
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Cache-Tolerance': {
                    description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-No-Cache': {
                    description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Respond-With': {
                    description: `Specifies the (non-default) form of the crawled data you prefer.\n\n` +
                        `Supported formats: \n` +
                        `- markdown\n` +
                        `- html\n` +
                        `- text\n` +
                        `- pageshot\n` +
                        `- screenshot\n` +
                        `- content\n` +
                        `- any combination of the above\n` +
                        `- readerlm-v2\n` +
                        `- vlm\n\n` +
                        `Default: content\n`
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Wait-For-Selector': {
                    description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` +
                        'Example: `X-Wait-For-Selector: .content-block`\n'
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Target-Selector': {
                    description: `Specifies a CSS selector for return target instead of the full html.\n\n` +
                        'Implies `X-Wait-For-Selector: (same selector)`'
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Remove-Selector': {
                    description: `Specifies a CSS selector to remove elements from the full html.\n\n` +
                        'Example `X-Remove-Selector: nav`'
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Keep-Img-Data-Url': {
                    description: `Keep data-url as it instead of transforming them to object-url. (Only applicable when targeting markdown format)\n\n` +
                        'Example `X-Keep-Img-Data-Url: true`'
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Proxy-Url': {
                    description: `Specifies your custom proxy if you prefer to use one.\n\n` +
                        `Supported protocols: \n` +
                        `- http\n` +
                        `- https\n` +
                        `- socks4\n` +
                        `- socks5\n\n` +
                        `For authentication, https://user:pass@host:port`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Proxy': {
                    description: `Use a proxy server provided by us.\n\nOptionally specify two-letter country code.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Robots-Txt': {
                    description: `Load and conform to the respective robot.txt on the target origin.\n\nOptionally specify a bot UA to check against.\n\n`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'DNT': {
                    description: `When set to 1, prevent the result of this request to be cached in the system.\n\n`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Set-Cookie': {
                    description: `Sets cookie(s) to the headless browser for your request. \n\n` +
                        `Syntax is the same with standard Set-Cookie`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-Generated-Alt': {
                    description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
                        `Note: Does not work when \`X-Respond-With\` is specified`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-Images-Summary': {
                    description: `Enable dedicated summary section for images on the page.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-links-Summary': {
                    description: `Enable dedicated summary section for hyper links on the page.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Retain-Images': {
                    description: `Image retention modes.\n\n` +
                        `Supported modes: \n` +
                        `- all: all images\n` +
                        `- none: no images\n` +
                        `- alt: only alt text\n` +
                        `- all_p: all images and with generated alt text\n` +
                        `- alt_p: only alt text and with generated alt\n\n`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-Iframe': {
                    description: `Enable filling iframe contents into main. (violates standards)`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-Shadow-Dom': {
                    description: `Enable filling shadow dom contents into main. (violates standards)`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-User-Agent': {
                    description: `Override User-Agent.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Timeout': {
                    description: `Specify timeout in seconds. Max 180.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Locale': {
                    description: 'Specify browser locale for the page.',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Referer': {
                    description: 'Specify referer for the page.',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Token-Budget': {
                    description: 'Specify a budget in tokens.\n\nIf the resulting token cost exceeds the budget, the request is rejected.',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Respond-Timing': {
                    description: `Explicitly specify the respond timing. One of the following:\n\n` +
                        `- html: directly return unrendered HTML\n` +
                        `- visible-content: return immediately when any content becomes available\n` +
                        `- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
                        `- resource-idle: wait for no additional resources that would affect page logic and content has SUCCEEDED loading in 0.5s\n` +
                        `- media-idle: wait for no additional resources, including media resources, has SUCCEEDED loading in 0.5s\n` +
                        `- network-idle: wait for full load of webpage, also known as networkidle0.\n\n`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Engine': {
                    description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, cf-browser-rendering',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Base': {
                    description: 'Select base modes of relative URLs.\n\nSupported: initial, final',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Md-Heading-Style': {
                    description: 'Heading style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: setext, atx',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Md-Hr': {
                    description: 'Hr text of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Md-Bullet-List-Marker': {
                    description: 'Bullet list marker of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: -, +, *',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Md-Em-Delimiter': {
                    description: 'Em delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: _, *',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Md-Strong-Delimiter': {
                    description: 'Strong delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: **, __',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Md-Link-Style': {
                    description: 'Link style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: inlined, referenced, discarded',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Md-Link-Reference-Style': {
                    description: 'Link reference style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: full, collapsed, shortcut, discarded',
                    in: 'header',
                    schema: { type: 'string' }
                },
            }
        }
    }
})
/**
 * Options controlling a single crawl request.
 *
 * Values come from two places: the request payload (cast by
 * `AutoCastable.from`) and request headers read from the RPC call
 * environment. `CrawlerOptions.from` merges the two; see its documentation
 * for which source wins for each option.
 */
export class CrawlerOptions extends AutoCastable {

    @Prop()
    url?: string;

    @Prop()
    html?: string;

    /** Base mode for relative URLs; one of `BASE_URL_MODES`. */
    @Prop({
        type: BASE_URL_MODE_VALUES,
        default: 'initial',
    })
    base?: typeof BASE_URL_MODES[number];

    @Prop({
        desc: 'Base64 encoded PDF.',
        type: [FancyFile, String]
    })
    pdf?: FancyFile | string;

    /**
     * Requested output format. Either a single `CONTENT_FORMAT` value or a
     * free-form string combining several; always lower-cased by `from`.
     */
    @Prop({
        default: CONTENT_FORMAT.CONTENT,
        type: [CONTENT_FORMAT, String]
    })
    respondWith!: string;

    /** When true, `from` forces `retainImages` to `'all_p'`. */
    @Prop({
        default: false,
    })
    withGeneratedAlt!: boolean;

    @Prop({ default: 'all', type: IMAGE_RETENTION_MODE_VALUES })
    retainImages?: typeof IMAGE_RETENTION_MODES[number];

    /** Boolean, or the literal string `'all'` when the header carries that value. */
    @Prop({
        default: false,
    })
    withLinksSummary!: boolean | string;

    @Prop({
        default: false,
    })
    withImagesSummary!: boolean;

    @Prop({
        default: false,
    })
    noCache!: boolean;

    /** Boolean, or the literal string `'table'` when the header carries that value. */
    @Prop({
        default: false,
    })
    noGfm!: string | boolean;

    /**
     * Cache tolerance. Supplied in seconds (per the `X-Cache-Tolerance`
     * header documentation) and multiplied by 1000 at the end of `from`.
     */
    @Prop()
    cacheTolerance?: number;

    @Prop({ arrayOf: String })
    targetSelector?: string | string[];

    /** Defaults to `targetSelector` when not given explicitly. */
    @Prop({ arrayOf: String })
    waitForSelector?: string | string[];

    @Prop({ arrayOf: String })
    removeSelector?: string | string[];

    @Prop({
        default: false,
    })
    keepImgDataUrl!: boolean;

    @Prop({
        default: false,
        type: [String, Boolean]
    })
    withIframe!: boolean | 'quoted';

    @Prop({
        default: false,
    })
    withShadowDom!: boolean;

    /**
     * Accepted as Set-Cookie style strings; `from` replaces them with the
     * parsed `Cookie` objects (always an array after `from`, possibly empty).
     */
    @Prop({
        arrayOf: String,
    })
    setCookies?: Cookie[];

    @Prop()
    proxyUrl?: string;

    @Prop()
    proxy?: string;

    @Prop()
    userAgent?: string;

    /** Lower-cased by `from`; the aliases `vlm` and `readerlm-v2` are rewritten there. */
    @Prop()
    engine?: string;

    @Prop({
        arrayOf: String,
    })
    injectPageScript?: string[];

    @Prop({
        arrayOf: String,
    })
    injectFrameScript?: string[];

    /**
     * Timeout in seconds, in the range (0, 180]. `from` sets this to `null`
     * when the `X-Timeout` header is present but not a positive integer, and
     * when iframe/shadow-DOM filling is enabled without an explicit timeout.
     * NOTE(review): the meaning of `null` to consumers is not visible here — confirm.
     */
    @Prop({
        validate: (v: number) => v > 0 && v <= 180,
        type: Number,
        nullable: true,
    })
    timeout?: number | null;

    @Prop()
    locale?: string;

    @Prop()
    referer?: string;

    @Prop()
    tokenBudget?: number;

    @Prop()
    viewport?: Viewport;

    @Prop()
    instruction?: string;

    @Prop()
    jsonSchema?: object;

    @Prop()
    robotsTxt?: string;

    @Prop()
    doNotTrack?: number | null;

    @Prop()
    markdown?: TurnDownTweakableOptions;

    @Prop({
        type: RESPOND_TIMING,
    })
    respondTiming?: RESPOND_TIMING;

    // Not a @Prop: never populated from request input by this class.
    _hintIps?: string[];

    /**
     * Builds a `CrawlerOptions` from the request payload and overlays values
     * taken from request headers on the RPC call environment (if any).
     *
     * Precedence:
     * - Header overrides payload: respond-with / return-format, locale,
     *   referer, generated-alt, links/images summary, retain-images,
     *   no-cache, cache-tolerance, no-gfm, timeout, engine,
     *   keep-img-data-url, with-iframe, with-shadow-dom, base, set-cookie.
     * - Header only fills a missing payload value: remove/target/wait-for
     *   selectors, user-agent, proxy-url, proxy, robots-txt, token-budget,
     *   dnt, respond-timing, markdown tweaks.
     *
     * @param input Raw request payload; may carry the call context under
     *   `RPC_CALL_ENVIRONMENT`.
     * @returns The merged options instance.
     * @throws ParamValidationError when an LM format (`vlm`, `readerlm-v2`)
     *   is combined with `content` or `markdown` in `respondWith`.
     */
    static override from(input: any) {
        const instance = super.from(input) as CrawlerOptions;
        const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;

        const customMode = ctx?.get('x-respond-with') || ctx?.get('x-return-format');
        if (customMode) {
            instance.respondWith = customMode;
        }
        if (instance.respondWith) {
            instance.respondWith = instance.respondWith.toLowerCase();
        }
        // 'lm' matches both 'vlm' and 'readerlm-v2'.
        if (instance.respondWith?.includes('lm')) {
            if (instance.respondWith.includes('content') || instance.respondWith.includes('markdown')) {
                throw new ParamValidationError({
                    path: 'respondWith',
                    message: `LM formats conflicts with content/markdown.`,
                });
            }
        }

        const locale = ctx?.get('x-locale');
        if (locale) {
            instance.locale = locale;
        }

        const referer = ctx?.get('x-referer');
        if (referer) {
            instance.referer = referer;
        }

        // Flag headers: any non-empty value enables the flag.
        const withGeneratedAlt = ctx?.get('x-with-generated-alt');
        if (withGeneratedAlt) {
            instance.withGeneratedAlt = Boolean(withGeneratedAlt);
        }
        const withLinksSummary = ctx?.get('x-with-links-summary');
        if (withLinksSummary) {
            if (withLinksSummary === 'all') {
                instance.withLinksSummary = withLinksSummary;
            } else {
                instance.withLinksSummary = Boolean(withLinksSummary);
            }
        }
        const withImagesSummary = ctx?.get('x-with-images-summary');
        if (withImagesSummary) {
            instance.withImagesSummary = Boolean(withImagesSummary);
        }
        const retainImages = ctx?.get('x-retain-images');
        if (retainImages && IMAGE_RETENTION_MODE_VALUES.has(retainImages)) {
            instance.retainImages = retainImages as any;
        }
        // Generated alt text takes priority over any explicit retention mode.
        if (instance.withGeneratedAlt) {
            instance.retainImages = 'all_p';
        }
        const noCache = ctx?.get('x-no-cache');
        if (noCache) {
            instance.noCache = Boolean(noCache);
        }
        if (instance.noCache && instance.cacheTolerance === undefined) {
            instance.cacheTolerance = 0;
        }
        // An explicit tolerance header wins over the no-cache default of 0.
        const cacheTolerance = parseInt(ctx?.get('x-cache-tolerance') || '', 10);
        if (!Number.isNaN(cacheTolerance)) {
            instance.cacheTolerance = cacheTolerance;
        }

        const noGfm = ctx?.get('x-no-gfm');
        if (noGfm) {
            instance.noGfm = noGfm === 'table' ? noGfm : Boolean(noGfm);
        }

        const timeoutSeconds = parseInt(ctx?.get('x-timeout') || '', 10);
        if (!Number.isNaN(timeoutSeconds) && timeoutSeconds > 0) {
            // Clamp to the documented maximum instead of rejecting.
            instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
        } else if (ctx?.get('x-timeout')) {
            // Header present but not a positive integer.
            instance.timeout = null;
        }

        // Selector headers carry a ', '-separated list.
        const removeSelector = ctx?.get('x-remove-selector')?.split(', ').filter(Boolean);
        instance.removeSelector ??= removeSelector?.length ? removeSelector : undefined;
        const targetSelector = ctx?.get('x-target-selector')?.split(', ').filter(Boolean);
        instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
        const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
        // A target selector implies waiting for that same selector.
        instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
        const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
        instance.userAgent ??= overrideUserAgent;

        const engine = ctx?.get('x-engine');
        if (engine) {
            instance.engine = engine;
        }
        if (instance.engine) {
            instance.engine = instance.engine.toLowerCase();
        }
        // 'vlm' and 'readerlm-v2' are accepted as engine names but are really
        // output formats; translate them to an engine plus a respondWith.
        if (instance.engine === 'vlm') {
            instance.engine = ENGINE_TYPE.BROWSER;
            instance.respondWith = CONTENT_FORMAT.VLM;
        } else if (instance.engine === 'readerlm-v2') {
            instance.engine = ENGINE_TYPE.AUTO;
            instance.respondWith = CONTENT_FORMAT.READER_LM;
        }

        const keepImgDataUrl = ctx?.get('x-keep-img-data-url');
        if (keepImgDataUrl) {
            instance.keepImgDataUrl = Boolean(keepImgDataUrl);
        }
        const withIframe = ctx?.get('x-with-iframe');
        if (withIframe) {
            instance.withIframe = withIframe.toLowerCase() === 'quoted' ? 'quoted' : Boolean(withIframe);
        }
        if (instance.withIframe) {
            instance.timeout ??= null;
        }
        const withShadowDom = ctx?.get('x-with-shadow-dom');
        if (withShadowDom) {
            instance.withShadowDom = Boolean(withShadowDom);
        }
        if (instance.withShadowDom) {
            instance.timeout ??= null;
        }

        // Cookies: prefer the header when it actually carries something,
        // otherwise fall back to the payload. The header list is filtered
        // BEFORE the fallback decision because splitting an empty header
        // value yields [''] (truthy), which would otherwise always shadow
        // payload-supplied cookies. A missing header together with a missing
        // payload value yields an empty list rather than throwing.
        // NOTE(review): splitting on ', ' also splits inside an
        // `Expires=Wed, 21 Oct ...` attribute; kept as-is for compatibility.
        const headerCookies = ctx?.get('x-set-cookie')?.split(', ').filter(Boolean);
        const payloadCookies = instance.setCookies as unknown as string | string[] | undefined;
        let rawCookies: unknown[];
        if (headerCookies?.length) {
            rawCookies = headerCookies;
        } else if (Array.isArray(payloadCookies)) {
            rawCookies = payloadCookies;
        } else if (typeof payloadCookies === 'string') {
            rawCookies = [payloadCookies];
        } else {
            rawCookies = [];
        }
        instance.setCookies = rawCookies
            .filter((raw): raw is string => typeof raw === 'string' && raw.length > 0)
            .map((raw) => ({
                ...parseSetCookieString(raw, { decodeValues: true }),
            }));

        const proxyUrl = ctx?.get('x-proxy-url');
        instance.proxyUrl ??= proxyUrl || undefined;
        const proxy = ctx?.get('x-proxy');
        instance.proxy ??= proxy || undefined;
        const robotsTxt = ctx?.get('x-robots-txt');
        instance.robotsTxt ??= robotsTxt || undefined;

        const tokenBudget = ctx?.get('x-token-budget');
        instance.tokenBudget ??= parseInt(tokenBudget || '', 10) || undefined;

        // Only accept known base modes, mirroring the X-Retain-Images check.
        const baseMode = ctx?.get('x-base');
        if (baseMode && BASE_URL_MODE_VALUES.has(baseMode)) {
            instance.base = baseMode as typeof BASE_URL_MODES[number];
        }

        const dnt = ctx?.get('dnt');
        instance.doNotTrack ??= (parseInt(dnt || '', 10) || null);

        // Only accept known timings; an unknown value would match none of the
        // RESPOND_TIMING comparisons made by the methods below.
        const respondTiming = ctx?.get('x-respond-timing');
        if (respondTiming && (Object.values(RESPOND_TIMING) as string[]).includes(respondTiming)) {
            instance.respondTiming ??= respondTiming as RESPOND_TIMING;
        }

        // Scale the tolerance by 1000 (input is documented in seconds).
        if (instance.cacheTolerance) {
            instance.cacheTolerance = instance.cacheTolerance * 1000;
        }

        if (ctx) {
            instance.markdown ??= TurnDownTweakableOptions.fromCtx(ctx);
        }

        return instance;
    }

    /**
     * The respond timing to use: the explicit `respondTiming` when set,
     * otherwise NETWORK_IDLE for timeouts of 20 or more, MEDIA_IDLE when a
     * pageshot/screenshot/vlm output is requested, and RESOURCE_IDLE
     * in every other case.
     */
    get presumedRespondTiming() {
        if (this.respondTiming) {
            return this.respondTiming;
        }
        if (this.timeout && this.timeout >= 20) {
            return RESPOND_TIMING.NETWORK_IDLE;
        }
        // 'shot' matches both 'pageshot' and 'screenshot'.
        if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) {
            return RESPOND_TIMING.MEDIA_IDLE;
        }

        return RESPOND_TIMING.RESOURCE_IDLE;
    }

    /**
     * Decides whether an intermediate page snapshot is good enough to answer
     * the request without waiting for the page to finish.
     *
     * The checks are order-sensitive: earlier checks short-circuit later ones.
     *
     * @param snapshot The snapshot under consideration.
     * @returns true when the snapshot may be returned right away.
     */
    isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
        // An explicit wait-for selector always disables early response.
        if (this.waitForSelector?.length) {
            return false;
        }
        const presumedTiming = this.presumedRespondTiming;
        // media-idle: no media/content resource finished within the last 500ms.
        if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
            const now = Date.now();
            if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
                return true;
            }
        }
        // Image-based outputs need the corresponding image on the snapshot.
        if ((this.respondWith.includes('vlm') || this.respondWith.includes('pageshot')) && !snapshot.pageshot) {
            return false;
        }
        if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
            return false;
        }
        // resource-idle: no content resource finished within the last 500ms.
        if (presumedTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) {
            const now = Date.now();
            if ((snapshot.lastContentResourceLoaded + 500) < now) {
                return true;
            }
        }
        if (this.injectFrameScript?.length || this.injectPageScript?.length) {
            return false;
        }
        if (presumedTiming === RESPOND_TIMING.VISIBLE_CONTENT && snapshot.parsed?.content) {
            return true;
        }
        if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
            return true;
        }
        if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) {
            return false;
        }
        if (presumedTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
            return true;
        }
        if (this.respondWith.includes('lm')) {
            return false;
        }
        if (this.withIframe) {
            return false;
        }

        return !snapshot.isIntermediate;
    }

    /**
     * Whether a cached result may be looked up for this request. Caching is
     * skipped when disabled explicitly (noCache / zero tolerance) or when the
     * request carries cookies, injected scripts, or a custom viewport.
     */
    isCacheQueryApplicable() {
        if (this.noCache) {
            return false;
        }
        if (this.cacheTolerance === 0) {
            return false;
        }
        if (this.setCookies?.length) {
            return false;
        }
        if (this.injectFrameScript?.length || this.injectPageScript?.length) {
            return false;
        }
        if (this.viewport) {
            return false;
        }

        return true;
    }

    /** True when `respondWith` is not exactly one of the `CONTENT_FORMAT` values. */
    isRequestingCompoundContentFormat() {
        return !CONTENT_FORMAT_VALUES.has(this.respondWith);
    }

    /**
     * Whether the request can be served without a browser. Returns false as
     * soon as any option is set that this method treats as browser-only:
     * a respond timing other than html/visible-content, pageshot/screenshot
     * output, injected scripts, wait-for selectors, iframe or shadow-DOM
     * filling, a custom viewport, or inline `pdf`/`html` input.
     */
    browserIsNotRequired() {
        if (this.respondTiming && ![RESPOND_TIMING.HTML, RESPOND_TIMING.VISIBLE_CONTENT].includes(this.respondTiming)) {
            return false;
        }
        if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
            return false;
        }
        if (this.injectFrameScript?.length || this.injectPageScript?.length) {
            return false;
        }
        if (this.waitForSelector?.length) {
            return false;
        }
        if (this.withIframe || this.withShadowDom) {
            return false;
        }
        if (this.viewport) {
            return false;
        }
        if (this.pdf) {
            return false;
        }
        if (this.html) {
            return false;
        }

        return true;
    }
}

/**
 * Variant of `CrawlerOptions` that ignores the request payload: only the
 * RPC call environment is forwarded to `CrawlerOptions.from`, so every
 * option comes from headers or defaults.
 */
export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
    static override from(input: any) {
        // Carry over nothing but the call environment from the original input.
        const rpcEnvironment = Reflect.get(input, RPC_CALL_ENVIRONMENT);
        const headerOnlyInput = { [RPC_CALL_ENVIRONMENT]: rpcEnvironment };

        return super.from(headerOnlyInput) as CrawlerOptionsHeaderOnly;
    }
}