nazib61 committed on
Commit
c400431
·
verified ·
1 Parent(s): 6dc2268

Update scraper.js

Browse files
Files changed (1) hide show
  1. scraper.js +18 -14
scraper.js CHANGED
@@ -15,12 +15,10 @@ async function getPinterestVideos(query) {
15
  const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
16
  await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
17
 
18
- // --- SCROLLING LOGIC TO FIND MORE PINS ---
19
  await page.evaluate(async () => {
20
- for (let i = 0; i < 3; i++) { // Scroll 3 times to load ~60 pins
21
- window.scrollBy(0, window.innerHeight * 2);
22
- await new Promise(r => setTimeout(r, 1500));
23
- }
24
  });
25
 
26
  const pinUrls = await page.evaluate(() => {
@@ -28,33 +26,39 @@ async function getPinterestVideos(query) {
28
  .map(a => a.href).filter(href => href.includes('/pin/'));
29
  });
30
 
31
- // Get up to 60 unique pins to check
32
- const uniquePins = [...new Set(pinUrls)].slice(0, 60);
33
- console.error(`Found ${uniquePins.length} potential pins. Checking for video files...`);
34
 
35
  for (const pinUrl of uniquePins) {
36
  const pPage = await browser.newPage();
37
  try {
38
- // We go to each pin and try to find the video source
39
  await pPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
40
- const content = await pPage.content();
 
 
 
 
 
 
 
 
41
  const videoRegex = /https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/g;
42
- const matches = content.match(videoRegex);
43
 
44
  if (matches) {
45
  let rawUrl = matches[0].replace(/\\u002F/g, '/').replace(/[奖励"']/g, '');
46
  const hashMatch = rawUrl.match(/([a-f0-9]{32})/);
47
  if (hashMatch) {
48
  const h = hashMatch[1];
49
- // Output the 720p URL for Python to check
50
- process.stdout.write(`https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4\n`);
 
51
  }
52
  }
53
  } catch (e) {}
54
  await pPage.close();
55
  }
56
  } catch (error) {
57
- console.error("Scraper Error: " + error.message);
58
  } finally {
59
  if (browser) await browser.close();
60
  }
 
15
  const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
16
  await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
17
 
18
+ // Scroll a bit
19
  await page.evaluate(async () => {
20
+ window.scrollBy(0, 1000);
21
+ await new Promise(r => setTimeout(r, 1000));
 
 
22
  });
23
 
24
  const pinUrls = await page.evaluate(() => {
 
26
  .map(a => a.href).filter(href => href.includes('/pin/'));
27
  });
28
 
29
+ const uniquePins = [...new Set(pinUrls)].slice(0, 40);
 
 
30
 
31
  for (const pinUrl of uniquePins) {
32
  const pPage = await browser.newPage();
33
  try {
 
34
  await pPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
35
+
36
+ const data = await pPage.evaluate(() => {
37
+ // Extract Title and Description
38
+ const title = document.querySelector('h1')?.innerText || "";
39
+ const desc = document.querySelector('div[data-test-id="main-pin-description-text"]')?.innerText || "";
40
+ const html = document.body.innerHTML;
41
+ return { title, desc, html };
42
+ });
43
+
44
  const videoRegex = /https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/g;
45
+ const matches = data.html.match(videoRegex);
46
 
47
  if (matches) {
48
  let rawUrl = matches[0].replace(/\\u002F/g, '/').replace(/[奖励"']/g, '');
49
  const hashMatch = rawUrl.match(/([a-f0-9]{32})/);
50
  if (hashMatch) {
51
  const h = hashMatch[1];
52
+ const finalUrl = `https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4`;
53
+ // Output format: URL | TITLE | DESCRIPTION
54
+ process.stdout.write(`${finalUrl}|${data.title.replace(/\n/g, ' ')}|${data.desc.replace(/\n/g, ' ')}\n`);
55
  }
56
  }
57
  } catch (e) {}
58
  await pPage.close();
59
  }
60
  } catch (error) {
61
+ process.stderr.write(error.message);
62
  } finally {
63
  if (browser) await browser.close();
64
  }