nazib61 committed on
Commit
f407186
·
verified ·
1 Parent(s): fed2287

Update scraper.js

Browse files
Files changed (1) hide show
  1. scraper.js +62 -49
scraper.js CHANGED
@@ -1,76 +1,89 @@
1
  const puppeteer = require('puppeteer-core');
2
 
3
  async function getPinterestVideo(query) {
4
- // 1. Launch Browser using the System Chromium installed via packages.txt
5
- const browser = await puppeteer.launch({
6
- executablePath: '/usr/bin/chromium',
7
- headless: 'new',
8
- args: [
9
- '--no-sandbox',
10
- '--disable-setuid-sandbox',
11
- '--disable-dev-shm-usage',
12
- '--window-size=1920,1080'
13
- ]
14
- });
15
-
16
  try {
 
 
 
 
 
 
 
 
 
 
 
17
  const page = await browser.newPage();
18
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
19
-
20
- // 2. Search Pinterest
 
21
  const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
22
- await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 20000 });
 
 
23
 
24
- // 3. Find Pin URLs
 
 
 
 
25
  const pinUrls = await page.evaluate(() => {
26
- return Array.from(document.querySelectorAll('div[data-test-id="pinWrapper"] a'))
27
- .map(a => a.href)
28
- .filter(href => href.includes('/pin/'))
29
- .slice(0, 10); // Limit to top 10 to be fast
30
  });
31
 
32
- // 4. Loop through pins to find a video
33
- for (const pinUrl of pinUrls) {
 
 
 
 
 
 
 
 
34
  try {
35
- const pPage = await browser.newPage();
36
- await pPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
37
  const content = await pPage.content();
38
- await pPage.close();
39
-
40
- // Regex to find video file
41
- const match = content.match(/https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/);
42
 
43
- if (match) {
44
- let rawUrl = match[0].replace(/\\u002F/g, '/');
45
- // Convert to 720p URL if possible
46
- if(rawUrl.includes('/720p/')) {
47
- // It's already high quality
48
- } else {
49
- // Attempt to reconstruct high quality URL logic
50
- const hashMatch = rawUrl.match(/([a-f0-9]{32})/);
51
- if (hashMatch) {
52
- const h = hashMatch[1];
53
- rawUrl = `https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4`;
54
- }
55
- }
56
 
57
- console.log(rawUrl); // <--- PRINT ONLY THE URL
 
 
 
 
 
 
 
 
 
58
  await browser.close();
59
  return;
60
  }
61
  } catch (e) {
62
- // Ignore individual pin errors
 
 
63
  }
64
  }
65
- console.log("ERROR: No video found");
 
66
  } catch (error) {
67
  console.log("ERROR: " + error.message);
68
  } finally {
69
- if(browser) await browser.close();
70
  }
71
  }
72
 
73
- // Get args from Python
74
  const args = process.argv.slice(2);
75
- const query = args[0] || "cats video";
76
- getPinterestVideo(query);
 
1
  const puppeteer = require('puppeteer-core');
2
 
3
  async function getPinterestVideo(query) {
4
+ let browser;
 
 
 
 
 
 
 
 
 
 
 
5
  try {
6
+ browser = await puppeteer.launch({
7
+ executablePath: '/usr/bin/chromium',
8
+ headless: 'new',
9
+ args: [
10
+ '--no-sandbox',
11
+ '--disable-setuid-sandbox',
12
+ '--disable-dev-shm-usage',
13
+ '--disable-blink-features=AutomationControlled' // Helps avoid bot detection
14
+ ]
15
+ });
16
+
17
  const page = await browser.newPage();
18
+ // Set a very realistic User Agent
19
+ await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');
20
+
21
+ console.log(`Searching for: ${query}`);
22
  const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
23
+
24
+ // Go to search page and wait longer for content
25
+ await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
26
 
27
+ // Scroll down a bit to trigger video loading
28
+ await page.evaluate(() => window.scrollBy(0, 1000));
29
+ await new Promise(r => setTimeout(r, 2000));
30
+
31
+ // Find all Pin links
32
  const pinUrls = await page.evaluate(() => {
33
+ const links = Array.from(document.querySelectorAll('a[href*="/pin/"]'));
34
+ return links.map(a => a.href).filter(href => href.includes('/pin/'));
 
 
35
  });
36
 
37
+ if (pinUrls.length === 0) {
38
+ console.log("ERROR: No pins found on search page.");
39
+ return;
40
+ }
41
+
42
+ // Check top 15 pins for a video
43
+ const uniquePins = [...new Set(pinUrls)].slice(0, 15);
44
+
45
+ for (const pinUrl of uniquePins) {
46
+ const pPage = await browser.newPage();
47
  try {
48
+ // Set short timeout per pin to stay fast
49
+ await pPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 15000 });
50
  const content = await pPage.content();
 
 
 
 
51
 
52
+ // Pinterest stores video data in a JSON object in the HTML
53
+ // We look for v1.pinimg.com or any .mp4 link in the source
54
+ const videoRegex = /https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+\.mp4/g;
55
+ const matches = content.match(videoRegex);
56
+
57
+ if (matches && matches.length > 0) {
58
+ let bestUrl = matches[0].replace(/\\u002F/g, '/');
 
 
 
 
 
 
59
 
60
+ // Try to force 720p if it's a hash-based URL
61
+ const hashMatch = bestUrl.match(/([a-f0-9]{32})/);
62
+ if (hashMatch && !bestUrl.includes('720p')) {
63
+ const h = hashMatch[1];
64
+ const highRes = `https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4`;
65
+ bestUrl = highRes;
66
+ }
67
+
68
+ console.log(bestUrl);
69
+ await pPage.close();
70
  await browser.close();
71
  return;
72
  }
73
  } catch (e) {
74
+ // Continue to next pin if one fails
75
+ } finally {
76
+ if (!pPage.isClosed()) await pPage.close();
77
  }
78
  }
79
+
80
+ console.log("ERROR: No video file found in the first 15 pins.");
81
  } catch (error) {
82
  console.log("ERROR: " + error.message);
83
  } finally {
84
+ if (browser) await browser.close();
85
  }
86
  }
87
 
 
88
  const args = process.argv.slice(2);
89
+ getPinterestVideo(args.join(' '));