nazib61 committed on
Commit
1b61f26
·
verified ·
1 Parent(s): 6d9efef

Update scraper.js

Browse files
Files changed (1) hide show
  1. scraper.js +23 -21
scraper.js CHANGED
@@ -15,34 +15,36 @@ async function getPinterestVideos(query) {
15
  const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
16
  await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
17
 
18
- // Scroll a bit
19
  await page.evaluate(async () => {
20
- window.scrollBy(0, 1000);
21
- await new Promise(r => setTimeout(r, 1000));
 
 
22
  });
23
 
24
- const pinUrls = await page.evaluate(() => {
25
- return Array.from(document.querySelectorAll('a[href*="/pin/"]'))
26
- .map(a => a.href).filter(href => href.includes('/pin/'));
 
 
 
 
 
 
 
 
27
  });
28
 
29
- const uniquePins = [...new Set(pinUrls)].slice(0, 40);
 
30
 
31
- for (const pinUrl of uniquePins) {
32
  const pPage = await browser.newPage();
33
  try {
34
- await pPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
35
-
36
- const data = await pPage.evaluate(() => {
37
- // Extract Title and Description
38
- const title = document.querySelector('h1')?.innerText || "";
39
- const desc = document.querySelector('div[data-test-id="main-pin-description-text"]')?.innerText || "";
40
- const html = document.body.innerHTML;
41
- return { title, desc, html };
42
- });
43
-
44
  const videoRegex = /https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/g;
45
- const matches = data.html.match(videoRegex);
46
 
47
  if (matches) {
48
  let rawUrl = matches[0].replace(/\\u002F/g, '/').replace(/[奖励"']/g, '');
@@ -50,8 +52,8 @@ async function getPinterestVideos(query) {
50
  if (hashMatch) {
51
  const h = hashMatch[1];
52
  const finalUrl = `https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4`;
53
- // Output format: URL | TITLE | DESCRIPTION
54
- process.stdout.write(`${finalUrl}|${data.title.replace(/\n/g, ' ')}|${data.desc.replace(/\n/g, ' ')}\n`);
55
  }
56
  }
57
  } catch (e) {}
 
15
  const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
16
  await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
17
 
 
18
  await page.evaluate(async () => {
19
+ for (let i = 0; i < 2; i++) {
20
+ window.scrollBy(0, window.innerHeight * 2);
21
+ await new Promise(r => setTimeout(r, 1000));
22
+ }
23
  });
24
 
25
// Collect every pin link found on the search results page, paired with a
// human-readable title. The callback is handed to page.evaluate, so it only
// touches DOM APIs and returns plain objects ({ url, title }).
// NOTE(review): `page` is presumably a Puppeteer Page created earlier in
// getPinterestVideos — confirm against the part of the file not shown here.
const pins = await page.evaluate(() => {
  const results = [];
  const anchors = document.querySelectorAll('a[href*="/pin/"]');
  for (const a of anchors) {
    const img = a.querySelector('img');
    // Arrays have no append() method; push() is what adds an element.
    // Calling append() threw a TypeError and aborted the whole collection.
    results.push({
      url: a.href,
      // Prefer the anchor's aria-label, then the thumbnail's alt text. Fall
      // back to an empty string so the title is never null (which would be
      // printed as the literal text "null" when interpolated later).
      title: a.getAttribute('aria-label') || img?.getAttribute('alt') || '',
    });
  }
  return results;
});
37
 
38
+ const uniquePins = Array.from(new Set(pins.map(p => p.url)))
39
+ .map(url => pins.find(p => p.url === url)).slice(0, 30);
40
 
41
+ for (const pin of uniquePins) {
42
  const pPage = await browser.newPage();
43
  try {
44
+ await pPage.goto(pin.url, { waitUntil: 'domcontentloaded', timeout: 8000 });
45
+ const content = await pPage.content();
 
 
 
 
 
 
 
 
46
  const videoRegex = /https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/g;
47
+ const matches = content.match(videoRegex);
48
 
49
  if (matches) {
50
  let rawUrl = matches[0].replace(/\\u002F/g, '/').replace(/[奖励"']/g, '');
 
52
  if (hashMatch) {
53
  const h = hashMatch[1];
54
  const finalUrl = `https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4`;
55
+ // Output: URL|METADATA for Python to split
56
+ process.stdout.write(`${finalUrl}|${pin.title}\n`);
57
  }
58
  }
59
  } catch (e) {}