Spaces:
Sleeping
Sleeping
Update scraper.js
Browse files- scraper.js +14 -39
scraper.js
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
const puppeteer = require('puppeteer-core');
|
| 2 |
|
| 3 |
-
async function
|
| 4 |
let browser;
|
| 5 |
try {
|
| 6 |
browser = await puppeteer.launch({
|
| 7 |
executablePath: '/usr/bin/chromium',
|
| 8 |
headless: 'new',
|
| 9 |
-
args: ['--no-sandbox', '--disable-setuid-sandbox'
|
| 10 |
});
|
| 11 |
|
| 12 |
const page = await browser.newPage();
|
|
@@ -15,64 +15,39 @@ async function getPinterestVideo(query) {
|
|
| 15 |
const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
|
| 16 |
await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });
|
| 17 |
|
| 18 |
-
// Get pin links
|
| 19 |
const pinUrls = await page.evaluate(() => {
|
| 20 |
return Array.from(document.querySelectorAll('a[href*="/pin/"]'))
|
| 21 |
-
.map(a => a.href)
|
| 22 |
-
.filter(href => href.includes('/pin/'));
|
| 23 |
});
|
| 24 |
|
| 25 |
const uniquePins = [...new Set(pinUrls)].slice(0, 10);
|
| 26 |
|
|
|
|
| 27 |
for (const pinUrl of uniquePins) {
|
| 28 |
const pPage = await browser.newPage();
|
| 29 |
try {
|
| 30 |
-
await pPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout:
|
| 31 |
const content = await pPage.content();
|
| 32 |
-
|
| 33 |
-
// 1. Look for any Pinterest video URL
|
| 34 |
const videoRegex = /https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/g;
|
| 35 |
const matches = content.match(videoRegex);
|
| 36 |
|
| 37 |
-
if (matches
|
| 38 |
-
let rawUrl = matches[0].replace(/\\u002F/g, '/').replace(/"
|
| 39 |
-
|
| 40 |
-
// 2. QUALITY BOOSTER LOGIC
|
| 41 |
-
// Pinterest stores videos in a specific hash format.
|
| 42 |
-
// We extract the 32-character MD5 hash and build the 720p path.
|
| 43 |
const hashMatch = rawUrl.match(/([a-f0-9]{32})/);
|
| 44 |
-
|
| 45 |
if (hashMatch) {
|
| 46 |
-
const
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
const h3 = hash.substring(4, 6);
|
| 50 |
-
|
| 51 |
-
// This is the official Pinterest path for HD videos
|
| 52 |
-
const hdUrl = `https://v1.pinimg.com/videos/mc/720p/${h1}/${h2}/${h3}/${hash}.mp4`;
|
| 53 |
-
|
| 54 |
-
console.log(hdUrl); // Return the HD URL
|
| 55 |
-
} else {
|
| 56 |
-
console.log(rawUrl); // Fallback to original if hash fails
|
| 57 |
}
|
| 58 |
-
|
| 59 |
-
await pPage.close();
|
| 60 |
-
await browser.close();
|
| 61 |
-
return;
|
| 62 |
}
|
| 63 |
-
} catch (e) {
|
| 64 |
-
|
| 65 |
-
} finally {
|
| 66 |
-
if (!pPage.isClosed()) await pPage.close();
|
| 67 |
-
}
|
| 68 |
}
|
| 69 |
-
console.log("ERROR: No video found");
|
| 70 |
} catch (error) {
|
| 71 |
-
console.log("ERROR
|
| 72 |
} finally {
|
| 73 |
if (browser) await browser.close();
|
| 74 |
}
|
| 75 |
}
|
| 76 |
|
| 77 |
-
|
| 78 |
-
getPinterestVideo(args.join(' '));
|
|
|
|
| 1 |
const puppeteer = require('puppeteer-core');
|
| 2 |
|
| 3 |
+
/**
 * Derives the 720p MP4 URL for the first Pinterest video referenced in a pin page's HTML.
 *
 * @param {string} html - Full HTML source of a pin page.
 * @returns {string|null} The 720p URL, or null when the page references no
 *   v1.pinimg.com video or the video URL carries no 32-character hex hash.
 */
function extractHdVideoUrl(html) {
  // The match stops at whitespace or a quote, so it never contains either.
  const videoMatch = html.match(/https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/);
  if (!videoMatch) {
    return null;
  }

  // Embedded JSON may escape later slashes as the literal text "\u002F".
  const rawUrl = videoMatch[0].replace(/\\u002F/g, '/');

  const hashMatch = rawUrl.match(/[a-f0-9]{32}/);
  if (!hashMatch) {
    return null;
  }

  // The 720p path nests the file under the first three byte pairs of its hash.
  const hash = hashMatch[0];
  const shards = [hash.slice(0, 2), hash.slice(2, 4), hash.slice(4, 6)].join('/');
  return `https://v1.pinimg.com/videos/mc/720p/${shards}/${hash}.mp4`;
}

/**
 * Searches Pinterest for `query`, visits up to 10 distinct pin pages from the
 * results, and prints one 720p video URL per line on stdout for every pin
 * whose HTML references a video.
 *
 * stdout carries only results (or the single line "ERROR" when the run as a
 * whole fails); diagnostics go to stderr so they cannot be mistaken for URLs.
 *
 * @param {string} query - Free-text search terms.
 * @returns {Promise<void>} Resolves once the browser has been closed.
 */
async function getPinterestVideos(query) {
  let browser;
  try {
    browser = await puppeteer.launch({
      executablePath: '/usr/bin/chromium',
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    const page = await browser.newPage();
    // NOTE(review): the diff view this was recovered from elides two lines here
    // (new-file lines 13-14) — restore them if they configure the page.
    const searchUrl = `https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
    await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 30000 });

    // Runs inside the page: collect the resolved href of every pin link.
    const pinUrls = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('a[href*="/pin/"]'))
        .map((a) => a.href)
        .filter((href) => href.includes('/pin/'));
    });

    const uniquePins = [...new Set(pinUrls)].slice(0, 10);

    // Pins are visited one at a time; a failing pin is skipped, not fatal.
    for (const pinUrl of uniquePins) {
      const pPage = await browser.newPage();
      try {
        await pPage.goto(pinUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
        const hdUrl = extractHdVideoUrl(await pPage.content());
        if (hdUrl) {
          console.log(hdUrl);
        }
      } catch (err) {
        console.error(`Skipping ${pinUrl}: ${String(err)}`);
      } finally {
        await pPage.close();
      }
    }
  } catch (error) {
    console.error(error);
    console.log("ERROR");
  } finally {
    if (browser) await browser.close();
  }
}
|
| 52 |
|
| 53 |
+
// Script entry point: every CLI argument after the script path is joined into
// one search query. The returned promise can still reject (closing the browser
// during cleanup may fail), so report that on stderr and exit non-zero instead
// of leaving an unhandled rejection.
getPinterestVideos(process.argv.slice(2).join(' ')).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
|
|