nazib61 committed on
Commit
c892026
·
verified ·
1 Parent(s): e6c779f

Create scraper.js

Browse files
Files changed (1) hide show
  1. scraper.js +76 -0
scraper.js ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ const puppeteer = require('puppeteer-core');
2
+
3
/**
 * Loads the HTML of a single pin in its own tab and always closes that tab.
 *
 * @param {object} browser - Puppeteer browser used to open the tab.
 * @param {string} pinUrl - Absolute URL of the pin page.
 * @returns {Promise<string>} Serialized HTML of the pin page.
 */
async function fetchPinHtml(browser, pinUrl) {
  const pinPage = await browser.newPage();
  try {
    await pinPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
    return await pinPage.content();
  } finally {
    // Close the tab even when navigation fails or times out; otherwise every
    // failing pin would leave an open tab behind until the browser exits.
    await pinPage.close();
  }
}

/**
 * Extracts a v1.pinimg.com video URL from pin-page HTML.
 *
 * @param {string} content - HTML of a pin page.
 * @returns {string|null} The video URL (rewritten to the 720p .mp4 form when
 *   a 32-character hex hash is present), or null when the HTML has no match.
 */
function extractVideoUrl(content) {
  const match = content.match(/https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/);
  if (!match) {
    return null;
  }

  // The URL may carry JSON-escaped slashes (\u002F); turn them back into "/".
  const rawUrl = match[0].replace(/\\u002F/g, '/');
  if (rawUrl.includes('/720p/')) {
    return rawUrl; // Already the high-quality variant.
  }

  // Rebuild the 720p URL from the 32-hex-character hash: its first three
  // byte pairs form the directory path, and the full hash is the file name.
  const hashMatch = rawUrl.match(/([a-f0-9]{32})/);
  if (!hashMatch) {
    return rawUrl;
  }
  const h = hashMatch[1];
  return `https://v1.pinimg.com/videos/mc/720p/${h.substring(0, 2)}/${h.substring(2, 4)}/${h.substring(4, 6)}/${h}.mp4`;
}

/**
 * Searches Pinterest for `query`, visits up to the first 10 pin links of the
 * result page and prints the first video URL it finds to stdout.
 *
 * Output contract (stdout, one line):
 *   - the video URL on success;
 *   - "ERROR: No video found" when no visited pin contains a video URL;
 *   - "ERROR: <message>" when the browser cannot be launched or the search
 *     itself fails.
 *
 * @param {string} query - Free-text search term.
 * @returns {Promise<void>} Resolves once the result line has been printed and
 *   the browser has been closed.
 */
async function getPinterestVideo(query) {
  let browser;
  try {
    // 1. Launch the browser using the system Chromium installed via packages.txt.
    //    Launching inside the try block means a missing/broken Chromium is
    //    reported as "ERROR: ..." instead of rejecting the returned promise.
    browser = await puppeteer.launch({
      executablePath: '/usr/bin/chromium',
      headless: 'new',
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--window-size=1920,1080',
      ],
    });

    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

    // 2. Search Pinterest.
    const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
    await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 20000 });

    // 3. Collect pin URLs from the result grid.
    const pinUrls = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('div[data-test-id="pinWrapper"] a'))
        .map((a) => a.href)
        .filter((href) => href.includes('/pin/'))
        .slice(0, 10); // Limit to the top 10 to stay fast.
    });

    // 4. Visit the pins one at a time until one of them yields a video URL.
    for (const pinUrl of pinUrls) {
      let content;
      try {
        content = await fetchPinHtml(browser, pinUrl);
      } catch (e) {
        // Best effort: a pin that fails to load is skipped. Nothing is
        // printed because stdout is reserved for the single result line.
        continue;
      }

      const videoUrl = extractVideoUrl(content);
      if (videoUrl) {
        console.log(videoUrl); // <--- PRINT ONLY THE URL
        return; // The finally block closes the browser exactly once.
      }
    }
    console.log("ERROR: No video found");
  } catch (error) {
    console.log(`ERROR: ${error.message}`);
  } finally {
    if (browser) await browser.close();
  }
}
72
+
73
// Get args from Python: the first CLI argument is the search query.
// A missing or empty argument falls back to the default query.
const args = process.argv.slice(2);
const query = args[0] || "cats video";

// Do not leave the promise floating: report any rejection using the same
// "ERROR: <message>" line the scraper prints for its own failures, and exit
// with a non-zero status so the caller can detect the failure.
getPinterestVideo(query).catch((error) => {
  console.log("ERROR: " + error.message);
  process.exitCode = 1;
});