nazib61 committed on
Commit
095adeb
·
verified ·
1 Parent(s): 10bb8e0

Update scraper.js

Browse files
Files changed (1) hide show
  1. scraper.js +34 -32
scraper.js CHANGED
@@ -15,44 +15,46 @@ async function getPinterestVideos(query) {
15
  const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
16
  await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
17
 
18
- // Scroll deeply to get ~100 pins
19
- for (let i = 0; i < 5; i++) {
20
- await page.evaluate(() => window.scrollBy(0, 2000));
21
- await new Promise(r => setTimeout(r, 1000));
22
- }
 
 
23
 
24
  const pinUrls = await page.evaluate(() => {
25
- const anchors = Array.from(document.querySelectorAll('a[href*="/pin/"]'));
26
- return [...new Set(anchors.map(a => a.href))];
27
  });
28
 
29
- const limit = 80;
30
- const selectedPins = pinUrls.slice(0, limit);
31
- console.error(`Scraped ${selectedPins.length} pins. Extracting video data...`);
32
-
33
- // Process pins in small batches to avoid crashing the Hugging Face CPU
34
- for (let i = 0; i < selectedPins.length; i += 5) {
35
- const batch = selectedPins.slice(i, i + 5);
36
- await Promise.all(batch.map(async (url) => {
37
- const p = await browser.newPage();
38
- try {
39
- await p.goto(url, { waitUntil: 'domcontentloaded', timeout: 8000 });
40
- const html = await p.content();
41
- const match = html.match(/https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/);
42
- if (match) {
43
- let raw = match[0].replace(/\\u002F/g, '/').split('"')[0].split("'")[0];
44
- const hMatch = raw.match(/([a-f0-9]{32})/);
45
- if (hMatch) {
46
- const h = hMatch[1];
47
- const hd = `https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4`;
48
- process.stdout.write(hd + "\n");
49
- }
50
  }
51
- } catch (e) {} finally { await p.close(); }
52
- }));
 
53
  }
54
- } catch (err) {
55
- console.error(err);
56
  } finally {
57
  if (browser) await browser.close();
58
  }
 
15
  const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
16
  await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
17
 
18
+ // --- SCROLLING LOGIC TO FIND MORE PINS ---
19
+ await page.evaluate(async () => {
20
+ for (let i = 0; i < 3; i++) { // Scroll 3 times to load ~60 pins
21
+ window.scrollBy(0, window.innerHeight * 2);
22
+ await new Promise(r => setTimeout(r, 1500));
23
+ }
24
+ });
25
 
26
  const pinUrls = await page.evaluate(() => {
27
+ return Array.from(document.querySelectorAll('a[href*="/pin/"]'))
28
+ .map(a => a.href).filter(href => href.includes('/pin/'));
29
  });
30
 
31
+ // Get up to 60 unique pins to check
32
+ const uniquePins = [...new Set(pinUrls)].slice(0, 60);
33
+ console.error(`Found ${uniquePins.length} potential pins. Checking for video files...`);
34
+
35
+ for (const pinUrl of uniquePins) {
36
+ const pPage = await browser.newPage();
37
+ try {
38
+ // We go to each pin and try to find the video source
39
+ await pPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
40
+ const content = await pPage.content();
41
+ const videoRegex = /https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/g;
42
+ const matches = content.match(videoRegex);
43
+
44
+ if (matches) {
45
+ let rawUrl = matches[0].replace(/\\u002F/g, '/').replace(/[奖励"']/g, '');
46
+ const hashMatch = rawUrl.match(/([a-f0-9]{32})/);
47
+ if (hashMatch) {
48
+ const h = hashMatch[1];
49
+ // Output the 720p URL for Python to check
50
+ process.stdout.write(`https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4\n`);
 
51
  }
52
+ }
53
+ } catch (e) {}
54
+ await pPage.close();
55
  }
56
+ } catch (error) {
57
+ console.error("Scraper Error: " + error.message);
58
  } finally {
59
  if (browser) await browser.close();
60
  }