File size: 1,769 Bytes
0e759d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import { logger } from "../../../lib/logger";

/**
 * Looks for a handful of known markers in a page's content and, when one is
 * found, returns the scraper settings that should be used for that page.
 *
 * The checks run in a fixed order and the first one that applies wins:
 *
 * 1. `<meta name="readme-deploy"` present and the URL does not contain
 *    `developers.notion.com` → `fire-engine`, 1000ms wait, one scroll XPath.
 * 2. `<link href="https://static.vanta.com` present → `fire-engine`, 3000ms wait.
 * 3. `<meta itemprop="url" content="https://drive.google.com/file/d/...">`
 *    present and the linked URL has a `/view` segment right after the file
 *    id → `pdf` scraper pointed at the `uc?export=download` URL for that id.
 *
 * @param text - Page content that is searched for the marker tags.
 * @param url - URL of the page; returned unchanged in the `fire-engine` cases.
 * @returns The scraper override for the first matching case, or `null` when
 *   none of the cases apply.
 */
export async function handleCustomScraping(
  text: string,
  url: string,
): Promise<{
  scraper: string;
  url: string;
  waitAfterLoad?: number;
  pageOptions?: { scrollXPaths?: string[] };
} | null> {
  // Case 1: Readme Docs pages, with developers.notion.com explicitly excluded.
  const isReadmeDocs =
    text.includes('<meta name="readme-deploy"') &&
    !url.includes("developers.notion.com");
  if (isReadmeDocs) {
    logger.debug(
      `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`,
    );
    const scrollXPaths = [
      '//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]',
    ];
    return {
      scraper: "fire-engine",
      url,
      waitAfterLoad: 1000,
      pageOptions: { scrollXPaths },
    };
  }

  // Case 2: Vanta security portals, identified by their static asset link.
  const isVantaPortal = text.includes('<link href="https://static.vanta.com');
  if (isVantaPortal) {
    logger.debug(
      `Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`,
    );
    return { scraper: "fire-engine", url, waitAfterLoad: 3000 };
  }

  // Case 3: a Google Drive file URL advertised through a meta tag.
  const driveMeta =
    /<meta itemprop="url" content="(https:\/\/drive\.google\.com\/file\/d\/[^"]+)"/.exec(
      text,
    );
  if (driveMeta === null) {
    return null;
  }

  const driveUrl = driveMeta[1];
  logger.debug(`Google Drive PDF link detected: ${driveUrl}`);

  // Only links of the form .../file/d/<id>/view are rewritten to a download URL.
  const idMatch = /https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/.exec(
    driveUrl,
  );
  if (idMatch === null) {
    return null;
  }

  return {
    scraper: "pdf",
    url: `https://drive.google.com/uc?export=download&id=${idMatch[1]}`,
  };
}