nazib61 committed on
Commit
b208db7
·
verified ·
1 Parent(s): f6e88f5

Update scraper.js

Browse files
Files changed (1) hide show
  1. scraper.js +34 -31
scraper.js CHANGED
@@ -1,14 +1,14 @@
1
  const puppeteer = require('puppeteer-core');
2
 
3
- // --- PINTEREST LOGIC ---
4
  async function scrapePinterest(browser, query) {
5
  const page = await browser.newPage();
6
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');
7
-
8
  const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
9
  await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
10
 
11
- // Scroll to find more pins
12
  await page.evaluate(async () => {
13
  for (let i = 0; i < 3; i++) {
14
  window.scrollBy(0, window.innerHeight * 2);
@@ -36,65 +36,68 @@ async function scrapePinterest(browser, query) {
36
  const hashMatch = rawUrl.match(/([a-f0-9]{32})/);
37
  if (hashMatch) {
38
  const h = hashMatch[1];
39
- // Print 720p URL
40
  process.stdout.write(`https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4\n`);
41
  }
42
  }
43
  } catch (e) {}
44
  await pPage.close();
45
  }
 
46
  }
47
 
48
- // --- PEXELS LOGIC (FALLBACK) ---
49
  async function scrapePexels(browser, query, orientation) {
50
  const page = await browser.newPage();
51
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');
52
 
53
- // Convert "Any" to empty, otherwise use portrait/landscape
54
- let orientationParam = "";
55
- if (orientation.toLowerCase() === "portrait") orientationParam = "&orientation=portrait";
56
- if (orientation.toLowerCase() === "landscape") orientationParam = "&orientation=landscape";
57
 
58
- const searchUrl = `https://www.pexels.com/search/videos/${encodeURIComponent(query)}/?size=medium${orientationParam}`;
59
-
60
  console.error(`Fallback: Searching Pexels for ${orientation}...`);
61
- await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
62
 
63
- // Pexels puts direct MP4 links in <source> tags inside <article>
64
  const videoUrls = await page.evaluate(() => {
65
- const sources = Array.from(document.querySelectorAll('article video source[type="video/mp4"]'));
66
  return sources.map(s => s.src);
67
  });
68
 
69
- // Remove duplicates and print
70
  const uniqueUrls = [...new Set(videoUrls)];
71
- uniqueUrls.forEach(url => process.stdout.write(url + "\n"));
 
 
 
 
 
 
72
  }
73
 
74
  // --- MAIN CONTROLLER ---
75
  (async () => {
76
  const args = process.argv.slice(2);
77
- const platform = args[0]; // 'pinterest' or 'pexels'
78
  const query = args[1];
79
- const orientation = args[2] || "Any";
80
 
81
- let browser;
82
- try {
83
- browser = await puppeteer.launch({
84
- executablePath: '/usr/bin/chromium',
85
- headless: 'new',
86
- args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
87
- });
88
 
89
- if (platform === 'pinterest') {
 
90
  await scrapePinterest(browser, query);
91
- } else if (platform === 'pexels') {
92
  await scrapePexels(browser, query, orientation);
93
  }
94
-
95
- } catch (error) {
96
- console.error(error.message);
97
  } finally {
98
- if (browser) await browser.close();
99
  }
100
  })();
 
1
  const puppeteer = require('puppeteer-core');
2
 
3
+ // --- PINTEREST SCRAPER ---
4
  async function scrapePinterest(browser, query) {
5
  const page = await browser.newPage();
6
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');
7
+
8
  const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
9
  await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
10
 
11
+ // Scroll to load more
12
  await page.evaluate(async () => {
13
  for (let i = 0; i < 3; i++) {
14
  window.scrollBy(0, window.innerHeight * 2);
 
36
  const hashMatch = rawUrl.match(/([a-f0-9]{32})/);
37
  if (hashMatch) {
38
  const h = hashMatch[1];
39
+ // Output 720p version
40
  process.stdout.write(`https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4\n`);
41
  }
42
  }
43
  } catch (e) {}
44
  await pPage.close();
45
  }
46
+ await page.close();
47
  }
48
 
49
+ // --- PEXELS SCRAPER (FALLBACK) ---
50
  async function scrapePexels(browser, query, orientation) {
51
  const page = await browser.newPage();
52
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');
53
 
54
+ // Pexels allows orientation filtering in the URL!
55
+ let pexelsUrl = `https://www.pexels.com/search/videos/${encodeURIComponent(query)}/`;
56
+ if (orientation === 'Portrait') pexelsUrl += '?orientation=portrait';
57
+ if (orientation === 'Landscape') pexelsUrl += '?orientation=landscape';
58
 
 
 
59
  console.error(`Fallback: Searching Pexels for ${orientation}...`);
60
+ await page.goto(pexelsUrl, { waitUntil: 'networkidle2', timeout: 30000 });
61
 
62
+ // Extract video links directly from the source tags
63
  const videoUrls = await page.evaluate(() => {
64
+ const sources = Array.from(document.querySelectorAll('source[type="video/mp4"]'));
65
  return sources.map(s => s.src);
66
  });
67
 
68
+ // Clean and print URLs
69
  const uniqueUrls = [...new Set(videoUrls)];
70
+ uniqueUrls.forEach(url => {
71
+ // Remove query params to get direct link
72
+ const cleanUrl = url.split('?')[0];
73
+ process.stdout.write(`${cleanUrl}\n`);
74
+ });
75
+
76
+ await page.close();
77
  }
78
 
79
  // --- MAIN CONTROLLER ---
80
  (async () => {
81
  const args = process.argv.slice(2);
82
+ const mode = args[0]; // 'pinterest' or 'pexels'
83
  const query = args[1];
84
+ const orientation = args[2] || 'Any';
85
 
86
+ const browser = await puppeteer.launch({
87
+ executablePath: '/usr/bin/chromium',
88
+ headless: 'new',
89
+ args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
90
+ });
 
 
91
 
92
+ try {
93
+ if (mode === 'pinterest') {
94
  await scrapePinterest(browser, query);
95
+ } else if (mode === 'pexels') {
96
  await scrapePexels(browser, query, orientation);
97
  }
98
+ } catch (e) {
99
+ console.error(e);
 
100
  } finally {
101
+ await browser.close();
102
  }
103
  })();