nazib61 committed on
Commit
53f17c3
·
verified ·
1 Parent(s): 17e1071

Update scraper.js

Browse files
Files changed (1) hide show
  1. scraper.js +85 -53
scraper.js CHANGED
@@ -1,6 +1,83 @@
1
  const puppeteer = require('puppeteer-core');
2
 
3
- async function scrape(query, mode, orientation) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  let browser;
5
  try {
6
  browser = await puppeteer.launch({
@@ -8,61 +85,16 @@ async function scrape(query, mode, orientation) {
8
  headless: 'new',
9
  args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
10
  });
11
- const page = await browser.newPage();
12
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');
13
-
14
- if (mode === "pinterest") {
15
- const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
16
- await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
17
- await page.evaluate(async () => {
18
- window.scrollBy(0, 1500);
19
- await new Promise(r => setTimeout(r, 1000));
20
- });
21
- const pinUrls = await page.evaluate(() => {
22
- return Array.from(document.querySelectorAll('a[href*="/pin/"]')).map(a => a.href);
23
- });
24
- const uniquePins = [...new Set(pinUrls)].slice(0, 30);
25
-
26
- for (const pinUrl of uniquePins) {
27
- const pPage = await browser.newPage();
28
- try {
29
- await pPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
30
- const content = await pPage.content();
31
- const match = content.match(/https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/);
32
- if (match) {
33
- const hashMatch = match[0].match(/([a-f0-9]{32})/);
34
- if (hashMatch) {
35
- const h = hashMatch[1];
36
- console.log(`https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4`);
37
- }
38
- }
39
- } catch (e) {}
40
- await pPage.close();
41
- }
42
- }
43
- else if (mode === "pexels") {
44
- // Pexels supports orientation filter directly in the URL!
45
- const pexelsOri = orientation.toLowerCase() === "any" ? "" : orientation.toLowerCase();
46
- const searchUrl = `https://www.pexels.com/search/videos/${encodeURIComponent(query)}/?orientation=${pexelsOri}`;
47
- await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
48
-
49
- const videoUrls = await page.evaluate(() => {
50
- // Find all video source tags on Pexels search page
51
- return Array.from(document.querySelectorAll('video source'))
52
- .map(s => s.src)
53
- .filter(src => src.includes('.mp4') || src.includes('video-preview'));
54
- });
55
-
56
- videoUrls.forEach(url => console.log(url));
57
  }
58
 
59
  } catch (error) {
60
- process.stderr.write(error.message);
61
  } finally {
62
  if (browser) await browser.close();
63
  }
64
- }
65
-
66
- const args = process.argv.slice(2);
67
- // args[0] = query, args[1] = mode, args[2] = orientation
68
- scrape(args[0], args[1] || "pinterest", args[2] || "any");
 
1
  const puppeteer = require('puppeteer-core');
2
 
3
+ // --- PINTEREST LOGIC ---
4
/**
 * Finds a Pinterest-hosted video reference in a pin page's HTML and rewrites
 * it to the 720p MP4 URL layout used by this scraper.
 *
 * Every candidate URL is inspected in document order; the first one that
 * carries a 32-character lowercase hex hash wins.
 *
 * @param {string} html - Raw markup of a pin page.
 * @returns {string|null} The 720p MP4 URL, or null when no hash is found.
 */
function extractPinterestVideoUrl(html) {
  const candidates = html.match(/https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/g) || [];
  for (const candidate of candidates) {
    // URLs embedded in JSON may carry "/" as the literal text \u002F.
    const normalized = candidate.replace(/\\u002F/g, '/');
    const hashMatch = normalized.match(/[a-f0-9]{32}/);
    if (hashMatch) {
      const hash = hashMatch[0];
      return `https://v1.pinimg.com/videos/mc/720p/${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4, 6)}/${hash}.mp4`;
    }
  }
  return null;
}

/**
 * Searches Pinterest for `query`, visits up to 40 of the pins found and
 * writes one 720p MP4 URL per line to stdout for every distinct video.
 *
 * Diagnostics go to stderr so stdout stays a clean list of URLs.
 *
 * @param {object} browser - Puppeteer browser instance (only `newPage()` is used).
 * @param {string} query - Search phrase; must be a non-empty string.
 * @returns {Promise<void>}
 * @throws {Error} When `query` is missing/blank or the search page fails to load.
 */
async function scrapePinterest(browser, query) {
  if (typeof query !== 'string' || query.trim() === '') {
    // Without this guard the scraper would search for the text "undefined".
    throw new Error('A non-empty search query is required');
  }

  const page = await browser.newPage();
  let pinUrls;
  try {
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');

    const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
    await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });

    // Scroll a few screens so the page has a chance to load more pins.
    await page.evaluate(async () => {
      for (let i = 0; i < 3; i++) {
        window.scrollBy(0, window.innerHeight * 2);
        await new Promise((resolve) => setTimeout(resolve, 1500));
      }
    });

    pinUrls = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('a[href*="/pin/"]'))
        .map((a) => a.href)
        .filter((href) => href.includes('/pin/'));
    });
  } finally {
    // The search page is no longer needed; a failed close must not mask the real error.
    await page.close().catch(() => {});
  }

  const uniquePins = [...new Set(pinUrls)].slice(0, 40); // Check 40 pins
  const printedVideos = new Set();

  for (const pinUrl of uniquePins) {
    const pinPage = await browser.newPage();
    try {
      await pinPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 8000 });
      const videoUrl = extractPinterestVideoUrl(await pinPage.content());
      // Several pins can point at the same video; print each one once.
      if (videoUrl && !printedVideos.has(videoUrl)) {
        printedVideos.add(videoUrl);
        process.stdout.write(`${videoUrl}\n`);
      }
    } catch (error) {
      // One bad pin (timeout, navigation error) should not stop the whole run.
      console.error(`Skipping pin ${pinUrl}: ${error.message}`);
    } finally {
      await pinPage.close().catch(() => {});
    }
  }
}
47
+
48
+ // --- PEXELS LOGIC (FALLBACK) ---
49
/**
 * Searches Pexels videos for `query` and writes every distinct MP4 source URL
 * found on the results page to stdout, one per line.
 *
 * Diagnostics go to stderr so stdout stays a clean list of URLs.
 *
 * @param {object} browser - Puppeteer browser instance (only `newPage()` is used).
 * @param {string} query - Search phrase; must be a non-empty string.
 * @param {string} [orientation='Any'] - "portrait" or "landscape" (case-insensitive)
 *   adds an orientation filter; any other value applies no filter.
 * @returns {Promise<void>}
 * @throws {Error} When `query` is missing/blank or the results page fails to load.
 */
async function scrapePexels(browser, query, orientation = 'Any') {
  if (typeof query !== 'string' || query.trim() === '') {
    // Without this guard the scraper would search for the text "undefined".
    throw new Error('A non-empty search query is required');
  }

  // Only these two values are forwarded; anything else means "no filter".
  const normalizedOrientation = String(orientation).toLowerCase();
  const orientationParam = ['portrait', 'landscape'].includes(normalizedOrientation)
    ? `&orientation=${normalizedOrientation}`
    : '';

  const searchUrl = `https://www.pexels.com/search/videos/${encodeURIComponent(query)}/?size=medium${orientationParam}`;

  const page = await browser.newPage();
  let videoUrls;
  try {
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');

    console.error(`Fallback: Searching Pexels for ${orientation}...`);
    await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });

    // Collect the MP4 <source> tags nested in <article> elements.
    videoUrls = await page.evaluate(() => {
      const sources = Array.from(document.querySelectorAll('article video source[type="video/mp4"]'));
      return sources.map((source) => source.src);
    });
  } finally {
    // A failed close must not mask the real error (or a successful scrape).
    await page.close().catch(() => {});
  }

  // Drop duplicates and empty src values so stdout has no repeated or blank lines.
  const uniqueUrls = [...new Set(videoUrls)].filter(Boolean);
  for (const url of uniqueUrls) {
    process.stdout.write(`${url}\n`);
  }
}
73
+
74
+ // --- MAIN CONTROLLER ---
75
+ (async () => {
76
+ const args = process.argv.slice(2);
77
+ const platform = args[0]; // 'pinterest' or 'pexels'
78
+ const query = args[1];
79
+ const orientation = args[2] || "Any";
80
+
81
  let browser;
82
  try {
83
  browser = await puppeteer.launch({
 
85
  headless: 'new',
86
  args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
87
  });
88
+
89
+ if (platform === 'pinterest') {
90
+ await scrapePinterest(browser, query);
91
+ } else if (platform === 'pexels') {
92
+ await scrapePexels(browser, query, orientation);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  }
94
 
95
  } catch (error) {
96
+ console.error(error.message);
97
  } finally {
98
  if (browser) await browser.close();
99
  }
100
+ })();