Spaces:
Sleeping
Sleeping
Update scraper.js
Browse files- scraper.js +62 -49
scraper.js
CHANGED
|
@@ -1,76 +1,89 @@
|
|
| 1 |
const puppeteer = require('puppeteer-core');
|
| 2 |
|
| 3 |
async function getPinterestVideo(query) {
|
| 4 |
-
|
| 5 |
-
const browser = await puppeteer.launch({
|
| 6 |
-
executablePath: '/usr/bin/chromium',
|
| 7 |
-
headless: 'new',
|
| 8 |
-
args: [
|
| 9 |
-
'--no-sandbox',
|
| 10 |
-
'--disable-setuid-sandbox',
|
| 11 |
-
'--disable-dev-shm-usage',
|
| 12 |
-
'--window-size=1920,1080'
|
| 13 |
-
]
|
| 14 |
-
});
|
| 15 |
-
|
| 16 |
try {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
const page = await browser.newPage();
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
| 21 |
const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
|
| 22 |
-
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
const pinUrls = await page.evaluate(() => {
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
.filter(href => href.includes('/pin/'))
|
| 29 |
-
.slice(0, 10); // Limit to top 10 to be fast
|
| 30 |
});
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
try {
|
| 35 |
-
|
| 36 |
-
await pPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout:
|
| 37 |
const content = await pPage.content();
|
| 38 |
-
await pPage.close();
|
| 39 |
-
|
| 40 |
-
// Regex to find video file
|
| 41 |
-
const match = content.match(/https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/);
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
const hashMatch = rawUrl.match(/([a-f0-9]{32})/);
|
| 51 |
-
if (hashMatch) {
|
| 52 |
-
const h = hashMatch[1];
|
| 53 |
-
rawUrl = `https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4`;
|
| 54 |
-
}
|
| 55 |
-
}
|
| 56 |
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
await browser.close();
|
| 59 |
return;
|
| 60 |
}
|
| 61 |
} catch (e) {
|
| 62 |
-
//
|
|
|
|
|
|
|
| 63 |
}
|
| 64 |
}
|
| 65 |
-
|
|
|
|
| 66 |
} catch (error) {
|
| 67 |
console.log("ERROR: " + error.message);
|
| 68 |
} finally {
|
| 69 |
-
if(browser) await browser.close();
|
| 70 |
}
|
| 71 |
}
|
| 72 |
|
| 73 |
-
// Get args from Python
|
| 74 |
const args = process.argv.slice(2);
|
| 75 |
-
|
| 76 |
-
getPinterestVideo(query);
|
|
|
|
| 1 |
const puppeteer = require('puppeteer-core');
|
| 2 |
|
| 3 |
// Direct .mp4 link on Pinterest's video CDN (non-global: only the first hit is used).
const VIDEO_URL_PATTERN = /https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+\.mp4/;
// 32-character lowercase hex run used as the video's file name on the CDN.
const VIDEO_HASH_PATTERN = /[a-f0-9]{32}/;
// Upper bound on how many distinct pins are opened while looking for a video.
const MAX_PINS_TO_CHECK = 15;

/**
 * Pulls the first Pinterest CDN .mp4 URL out of a pin page's HTML.
 *
 * JSON-escaped slashes (`\u002F`) are decoded BEFORE matching, because the
 * URL pattern only recognises literal `/` characters. When the URL is not
 * already a 720p variant and contains a 32-hex hash, the 720p URL is rebuilt
 * from that hash.
 *
 * @param {string} html - Full HTML source of a pin page.
 * @returns {string|null} The video URL, or null when the page has none.
 */
function extractVideoUrl(html) {
  const decoded = html.replace(/\\u002F/gi, '/');
  const urlMatch = decoded.match(VIDEO_URL_PATTERN);
  if (!urlMatch) {
    return null;
  }

  const [foundUrl] = urlMatch;
  if (foundUrl.includes('720p')) {
    return foundUrl;
  }

  const hashMatch = foundUrl.match(VIDEO_HASH_PATTERN);
  if (!hashMatch) {
    return foundUrl;
  }

  // Try to force 720p: the path is built from the first three byte pairs of the hash.
  const [hash] = hashMatch;
  return `https://v1.pinimg.com/videos/mc/720p/${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4, 6)}/${hash}.mp4`;
}

/**
 * Opens a single pin in its own tab and looks for a video URL in its source.
 *
 * A pin that fails to load is treated as "no video" so the caller can move on
 * to the next one. The tab is always closed before this function settles.
 *
 * @param {object} browser - Launched Puppeteer browser.
 * @param {string} pinUrl - Absolute URL of the pin page.
 * @returns {Promise<string|null>} The video URL, or null if none was found.
 */
async function findVideoOnPin(browser, pinUrl) {
  const pinPage = await browser.newPage();
  try {
    // Short timeout per pin to stay fast.
    await pinPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 15000 });
    return extractVideoUrl(await pinPage.content());
  } catch {
    // Deliberate best-effort: a broken or slow pin must not abort the search.
    return null;
  } finally {
    if (!pinPage.isClosed()) {
      // A failed tab close is not worth failing the search over.
      await pinPage.close().catch(() => {});
    }
  }
}

/**
 * Searches Pinterest for `query` and prints the first video URL found.
 *
 * Output goes to stdout: a "Searching for: ..." line, then either the video
 * URL or a line starting with "ERROR: ". The browser is closed exactly once,
 * in the `finally` block, whichever path is taken.
 *
 * @param {string} query - Free-text search query.
 * @returns {Promise<void>} Settles once the browser has been shut down.
 */
async function getPinterestVideo(query) {
  let browser;
  try {
    browser = await puppeteer.launch({
      executablePath: '/usr/bin/chromium',
      headless: 'new',
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled', // Helps avoid bot detection
      ],
    });

    const page = await browser.newPage();
    // Set a very realistic User Agent
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');

    console.log(`Searching for: ${query}`);
    const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;

    // Go to search page and wait for the network to go (mostly) idle.
    await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });

    // Scroll down a bit to trigger lazy loading, then give it time to settle.
    await page.evaluate(() => window.scrollBy(0, 1000));
    await new Promise((resolve) => setTimeout(resolve, 2000));

    // Collect every pin link on the results page (runs inside the browser).
    const pinUrls = await page.evaluate(() =>
      Array.from(document.querySelectorAll('a[href*="/pin/"]'), (a) => a.href)
        .filter((href) => href.includes('/pin/')));

    if (pinUrls.length === 0) {
      console.log("ERROR: No pins found on search page.");
      return;
    }

    // The same pin is often linked more than once; de-duplicate before capping.
    const uniquePins = [...new Set(pinUrls)].slice(0, MAX_PINS_TO_CHECK);

    for (const pinUrl of uniquePins) {
      const videoUrl = await findVideoOnPin(browser, pinUrl);
      if (videoUrl !== null) {
        console.log(videoUrl);
        return; // The finally block below shuts the browser down.
      }
    }

    console.log(`ERROR: No video file found in the first ${MAX_PINS_TO_CHECK} pins.`);
  } catch (error) {
    console.log("ERROR: " + error.message);
  } finally {
    if (browser) {
      // Single shutdown point. A close failure is ignored so that an
      // already-reported result is not followed by a rejected promise.
      await browser.close().catch(() => {});
    }
  }
}
|
| 87 |
|
|
|
|
| 88 |
// CLI entry point: every command-line argument is joined into one search query.
const args = process.argv.slice(2);
const query = args.join(' ').trim();

if (query === '') {
  // Reported on stdout with the same "ERROR: " prefix the scraper uses,
  // instead of launching a browser to search for an empty string.
  console.log('ERROR: No search query provided.');
} else {
  // Never leave the promise floating: report an unexpected rejection in the
  // same format and mark the run as failed.
  getPinterestVideo(query).catch((error) => {
    console.log(`ERROR: ${error.message}`);
    process.exitCode = 1;
  });
}
|
|
|