Spaces:

nazib61
/

Pin_Video

Sleeping

App Files Files Community

nazib61 committed on 26 days ago

Commit

dc87db2

verified ·

1 Parent(s): aa29a19

Update scraper.js

Browse files

Files changed (1) hide show

scraper.js +39 -18

scraper.js CHANGED Viewed

@@ -1,6 +1,6 @@
 const puppeteer = require('puppeteer-core');
-async function scrape(query) {
     let browser;
     try {
         browser = await puppeteer.launch({
@@ -12,24 +12,24 @@ async function scrape(query) {
         const page = await browser.newPage();
         await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');
-        // --- ENGINE 1: PINTEREST ---
-        process.stderr.write("Checking Pinterest...\n");
         await page.goto(`https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}`, { waitUntil: 'networkidle2' });
-        let pinUrls = await page.evaluate(() => {
             return Array.from(document.querySelectorAll('a[href*="/pin/"]')).map(a => a.href);
         });
-        pinUrls = [...new Set(pinUrls)].slice(0, 15);
-        for (const url of pinUrls) {
             try {
                 const pPage = await browser.newPage();
                 await pPage.goto(url, { waitUntil: 'domcontentloaded', timeout: 10000 });
                 const content = await pPage.content();
                 const match = content.match(/https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/);
                 if (match) {
-                    const hashMatch = match[0].match(/([a-f0-9]{32})/);
-                    if (hashMatch) {
-                        const h = hashMatch[1];
                         process.stdout.write(`SOURCE_PINTEREST|https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4\n`);
                     }
                 }
@@ -37,17 +37,37 @@ async function scrape(query) {
             } catch (e) {}
         }
-        // --- ENGINE 2: PEXELS ---
-        process.stderr.write("Checking Pexels (Fallback Engine)...\n");
-        await page.goto(`https://www.pexels.com/search/videos/${encodeURIComponent(query)}/`, { waitUntil: 'networkidle2' });
-        // Pexels logic: Find all direct video sources
-        const pexelsVideos = await page.evaluate(() => {
-            const sources = Array.from(document.querySelectorAll('video source'));
-            return sources.map(s => s.src).filter(src => src.includes('mp4') || src.includes('video-files'));
         });
-        pexelsVideos.forEach(v => process.stdout.write(`SOURCE_PEXELS|${v}\n`));
     } catch (error) {
         process.stderr.write(`Error: ${error.message}\n`);
@@ -56,4 +76,5 @@ async function scrape(query) {
     }
 }
-scrape(process.argv.slice(2).join(' '));

 const puppeteer = require('puppeteer-core');
+async function scrape(query, orientation) {
     let browser;
     try {
         browser = await puppeteer.launch({
         const page = await browser.newPage();
         await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');
+        // --- STEP 1: PINTEREST (Manual Filter) ---
+        process.stderr.write("Searching Pinterest...\n");
         await page.goto(`https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}`, { waitUntil: 'networkidle2' });
+        let pins = await page.evaluate(() => {
             return Array.from(document.querySelectorAll('a[href*="/pin/"]')).map(a => a.href);
         });
+        pins = [...new Set(pins)].slice(0, 10);
+        for (const url of pins) {
             try {
                 const pPage = await browser.newPage();
                 await pPage.goto(url, { waitUntil: 'domcontentloaded', timeout: 10000 });
                 const content = await pPage.content();
                 const match = content.match(/https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/);
                 if (match) {
+                    const hMatch = match[0].match(/([a-f0-9]{32})/);
+                    if (hMatch) {
+                        const h = hMatch[1];
                         process.stdout.write(`SOURCE_PINTEREST|https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4\n`);
                     }
                 }
             } catch (e) {}
         }
+        // --- STEP 2: PEXELS (Automatic Orientation Filter) ---
+        // Pexels allows: ?orientation=portrait or ?orientation=landscape
+        const pexOri = orientation.toLowerCase() === "any" ? "" : `&orientation=${orientation.toLowerCase()}`;
+        const pexUrl = `https://www.pexels.com/search/videos/${encodeURIComponent(query)}/?${pexOri}`;
+        process.stderr.write(`Searching Pexels Fallback: ${pexUrl}\n`);
+        await page.goto(pexUrl, { waitUntil: 'networkidle2', timeout: 20000 });
+        // Get the first 5 video page links
+        const videoPageLinks = await page.evaluate(() => {
+            return Array.from(document.querySelectorAll('a[href*="/video/"]'))
+                .map(a => a.href)
+                .filter(href => !href.includes('/search/'));
         });
+        const uniquePex = [...new Set(videoPageLinks)].slice(0, 5);
+        for (const vUrl of uniquePex) {
+            try {
+                const vPage = await browser.newPage();
+                await vPage.goto(vUrl, { waitUntil: 'domcontentloaded', timeout: 15000 });
+                const html = await vPage.content();
+                // Look for the source file in the page metadata or video tag
+                const pexMatch = html.match(/https:\/\/videos\.pexels\.com\/video-files\/[^\s"']+/);
+                if (pexMatch) {
+                    process.stdout.write(`SOURCE_PEXELS|${pexMatch[0].replace(/"/g, '')}\n`);
+                }
+                await vPage.close();
+            } catch (e) {}
+        }
     } catch (error) {
         process.stderr.write(`Error: ${error.message}\n`);
     }
 }
+const args = process.argv.slice(2);
+scrape(args[0], args[1] || "Any");