nazib61 committed on
Commit
41f915d
·
verified ·
1 Parent(s): 0267b8a

Update scraper.js

Browse files
Files changed (1) hide show
  1. scraper.js +20 -31
scraper.js CHANGED
@@ -15,11 +15,11 @@ async function getPinterestVideos(query) {
15
  const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
16
  await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
17
 
18
- // Scroll to load ~40-60 pins
19
  await page.evaluate(async () => {
20
- for (let i = 0; i < 2; i++) {
21
  window.scrollBy(0, window.innerHeight * 2);
22
- await new Promise(r => setTimeout(r, 1000));
23
  }
24
  });
25
 
@@ -28,44 +28,33 @@ async function getPinterestVideos(query) {
28
  .map(a => a.href).filter(href => href.includes('/pin/'));
29
  });
30
 
31
- const uniquePins = [...new Set(pinUrls)].slice(0, 40);
 
 
32
 
33
  for (const pinUrl of uniquePins) {
34
  const pPage = await browser.newPage();
35
  try {
 
36
  await pPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
37
-
38
- // Extract high-quality metadata from the __PWS_DATA__ script tag
39
- const videoData = await pPage.evaluate(() => {
40
- const script = document.getElementById('__PWS_DATA__');
41
- if (!script) return null;
42
- const data = JSON.parse(script.innerText);
43
- const pinId = Object.keys(data.props.initialReduxState.pins)[0];
44
- const pin = data.props.initialReduxState.pins[pinId];
45
-
46
- if (pin && pin.videos && pin.videos.video_list) {
47
- const list = pin.videos.video_list;
48
- // Pick the best MP4 or HLS variant
49
- // Priority: V_720P > V_HLSV4 > others
50
- const best = list.V_720P || list.V_HLSV4 || Object.values(list)[0];
51
- return {
52
- url: best.url,
53
- width: best.width,
54
- height: best.height
55
- };
56
  }
57
- return null;
58
- });
59
-
60
- if (videoData) {
61
- // Output format: URL | WIDTH | HEIGHT
62
- process.stdout.write(`${videoData.url}|${videoData.width}|${videoData.height}\n`);
63
  }
64
- } catch (e) {}
65
  await pPage.close();
66
  }
67
  } catch (error) {
68
- console.error(error);
69
  } finally {
70
  if (browser) await browser.close();
71
  }
 
15
  const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
16
  await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
17
 
18
+ // --- SCROLLING LOGIC TO FIND MORE PINS ---
19
  await page.evaluate(async () => {
20
+ for (let i = 0; i < 3; i++) { // Scroll 3 times to load ~60 pins
21
  window.scrollBy(0, window.innerHeight * 2);
22
+ await new Promise(r => setTimeout(r, 1500));
23
  }
24
  });
25
 
 
28
  .map(a => a.href).filter(href => href.includes('/pin/'));
29
  });
30
 
31
+ // Get up to 60 unique pins to check
32
+ const uniquePins = [...new Set(pinUrls)].slice(0, 60);
33
+ console.error(`Found ${uniquePins.length} potential pins. Checking for video files...`);
34
 
35
  for (const pinUrl of uniquePins) {
36
  const pPage = await browser.newPage();
37
  try {
38
+ // We go to each pin and try to find the video source
39
  await pPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
40
+ const content = await pPage.content();
41
+ const videoRegex = /https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/g;
42
+ const matches = content.match(videoRegex);
43
+
44
+ if (matches) {
45
+ let rawUrl = matches[0].replace(/\\u002F/g, '/').replace(/[奖励"']/g, '');
46
+ const hashMatch = rawUrl.match(/([a-f0-9]{32})/);
47
+ if (hashMatch) {
48
+ const h = hashMatch[1];
49
+ // Output the 720p URL for Python to check
50
+ process.stdout.write(`https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4\n`);
 
 
 
 
 
 
 
 
51
  }
 
 
 
 
 
 
52
  }
53
+ } catch (e) {}
54
  await pPage.close();
55
  }
56
  } catch (error) {
57
+ console.error("Scraper Error: " + error.message);
58
  } finally {
59
  if (browser) await browser.close();
60
  }