nazib61 committed on
Commit
dc87db2
·
verified ·
1 Parent(s): aa29a19

Update scraper.js

Browse files
Files changed (1) hide show
  1. scraper.js +39 -18
scraper.js CHANGED
@@ -1,6 +1,6 @@
1
  const puppeteer = require('puppeteer-core');
2
 
3
- async function scrape(query) {
4
  let browser;
5
  try {
6
  browser = await puppeteer.launch({
@@ -12,24 +12,24 @@ async function scrape(query) {
12
  const page = await browser.newPage();
13
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');
14
 
15
- // --- ENGINE 1: PINTEREST ---
16
- process.stderr.write("Checking Pinterest...\n");
17
  await page.goto(`https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}`, { waitUntil: 'networkidle2' });
18
- let pinUrls = await page.evaluate(() => {
19
  return Array.from(document.querySelectorAll('a[href*="/pin/"]')).map(a => a.href);
20
  });
21
- pinUrls = [...new Set(pinUrls)].slice(0, 15);
22
 
23
- for (const url of pinUrls) {
24
  try {
25
  const pPage = await browser.newPage();
26
  await pPage.goto(url, { waitUntil: 'domcontentloaded', timeout: 10000 });
27
  const content = await pPage.content();
28
  const match = content.match(/https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/);
29
  if (match) {
30
- const hashMatch = match[0].match(/([a-f0-9]{32})/);
31
- if (hashMatch) {
32
- const h = hashMatch[1];
33
  process.stdout.write(`SOURCE_PINTEREST|https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4\n`);
34
  }
35
  }
@@ -37,17 +37,37 @@ async function scrape(query) {
37
  } catch (e) {}
38
  }
39
 
40
- // --- ENGINE 2: PEXELS ---
41
- process.stderr.write("Checking Pexels (Fallback Engine)...\n");
42
- await page.goto(`https://www.pexels.com/search/videos/${encodeURIComponent(query)}/`, { waitUntil: 'networkidle2' });
 
43
 
44
- // Pexels logic: Find all direct video sources
45
- const pexelsVideos = await page.evaluate(() => {
46
- const sources = Array.from(document.querySelectorAll('video source'));
47
- return sources.map(s => s.src).filter(src => src.includes('mp4') || src.includes('video-files'));
 
 
 
 
48
  });
49
 
50
- pexelsVideos.forEach(v => process.stdout.write(`SOURCE_PEXELS|${v}\n`));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  } catch (error) {
53
  process.stderr.write(`Error: ${error.message}\n`);
@@ -56,4 +76,5 @@ async function scrape(query) {
56
  }
57
  }
58
 
59
- scrape(process.argv.slice(2).join(' '));
 
 
1
  const puppeteer = require('puppeteer-core');
2
 
3
+ async function scrape(query, orientation) {
4
  let browser;
5
  try {
6
  browser = await puppeteer.launch({
 
12
  const page = await browser.newPage();
13
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36');
14
 
15
+ // --- STEP 1: PINTEREST (Manual Filter) ---
16
+ process.stderr.write("Searching Pinterest...\n");
17
  await page.goto(`https://www.pinterest.com/search/pins/?q=${encodeURIComponent(query)}`, { waitUntil: 'networkidle2' });
18
+ let pins = await page.evaluate(() => {
19
  return Array.from(document.querySelectorAll('a[href*="/pin/"]')).map(a => a.href);
20
  });
21
+ pins = [...new Set(pins)].slice(0, 10);
22
 
23
+ for (const url of pins) {
24
  try {
25
  const pPage = await browser.newPage();
26
  await pPage.goto(url, { waitUntil: 'domcontentloaded', timeout: 10000 });
27
  const content = await pPage.content();
28
  const match = content.match(/https:\/\/v1\.pinimg\.com\/videos\/mc\/[^\s"']+/);
29
  if (match) {
30
+ const hMatch = match[0].match(/([a-f0-9]{32})/);
31
+ if (hMatch) {
32
+ const h = hMatch[1];
33
  process.stdout.write(`SOURCE_PINTEREST|https://v1.pinimg.com/videos/mc/720p/${h.substring(0,2)}/${h.substring(2,4)}/${h.substring(4,6)}/${h}.mp4\n`);
34
  }
35
  }
 
37
  } catch (e) {}
38
  }
39
 
40
+ // --- STEP 2: PEXELS (Automatic Orientation Filter) ---
41
+ // Pexels allows: ?orientation=portrait or ?orientation=landscape
42
+ const pexOri = orientation.toLowerCase() === "any" ? "" : `&orientation=${orientation.toLowerCase()}`;
43
+ const pexUrl = `https://www.pexels.com/search/videos/${encodeURIComponent(query)}/?${pexOri}`;
44
 
45
+ process.stderr.write(`Searching Pexels Fallback: ${pexUrl}\n`);
46
+ await page.goto(pexUrl, { waitUntil: 'networkidle2', timeout: 20000 });
47
+
48
+ // Get the first 5 video page links
49
+ const videoPageLinks = await page.evaluate(() => {
50
+ return Array.from(document.querySelectorAll('a[href*="/video/"]'))
51
+ .map(a => a.href)
52
+ .filter(href => !href.includes('/search/'));
53
  });
54
 
55
+ const uniquePex = [...new Set(videoPageLinks)].slice(0, 5);
56
+
57
+ for (const vUrl of uniquePex) {
58
+ try {
59
+ const vPage = await browser.newPage();
60
+ await vPage.goto(vUrl, { waitUntil: 'domcontentloaded', timeout: 15000 });
61
+ const html = await vPage.content();
62
+
63
+ // Look for the source file in the page metadata or video tag
64
+ const pexMatch = html.match(/https:\/\/videos\.pexels\.com\/video-files\/[^\s"']+/);
65
+ if (pexMatch) {
66
+ process.stdout.write(`SOURCE_PEXELS|${pexMatch[0].replace(/"/g, '')}\n`);
67
+ }
68
+ await vPage.close();
69
+ } catch (e) {}
70
+ }
71
 
72
  } catch (error) {
73
  process.stderr.write(`Error: ${error.message}\n`);
 
76
  }
77
  }
78
 
79
+ const args = process.argv.slice(2);
80
+ scrape(args[0], args[1] || "Any");