const crypto = require('crypto');

const { chromium } = require('playwright');
const { createClient } = require('@supabase/supabase-js');
const express = require('express');

const app = express();
app.use(express.json());
app.server = app.listen(7860, () => console.log('✅ LandSea Engine on port 7860'));
app.server.timeout = 180000;

const supabase = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_KEY);

/** Maximum number of scrape jobs (one browser each) running at the same time. */
const MAX_WORKERS = 4;
/** Upper bound on the number of leads a single request may ask for. */
const MAX_LIMIT = 40;
/** Number of leads requested when the caller sends no usable `limit`. */
const DEFAULT_LIMIT = 10;

// ── DATA QUALITY ──────────────────────────────────────────────────────────────

/** Hosts/fragments that are social, map or directory links rather than a business's own site. */
const BLOCKED_WEBSITE_FRAGMENTS = [
  'facebook.com', 'instagram.com', 'twitter.com', 'youtube.com',
  'google.com', 'goo.gl', 'maps.google', 'wa.me', 'whatsapp', 'linkedin.com',
  'justdial.com', 'indiamart.com', 'tradeindia.com',
];

/** Placeholder strings that must never be accepted as a business name. */
const BAD_NAMES = new Set(['unknown', 'undefined', 'null', 'n/a', 'na', 'none', 'test']);

/** Formats a 10-digit mobile number as `+91 XXXXX XXXXX`. */
function formatIndianMobile(tenDigits) {
  return `+91 ${tenDigits.slice(0, 5)} ${tenDigits.slice(5)}`;
}

/**
 * Normalizes a scraped phone number.
 *
 * @param {*} phone - Raw phone text (any type; non-strings are stringified).
 * @returns {string|null} `+91 XXXXX XXXXX` for 10-digit numbers starting with 6-9
 *   (optionally prefixed by `91`), the trimmed input for other 8-11 digit numbers,
 *   or `null` when the value does not look like a phone number.
 */
function cleanPhone(phone) {
  if (phone == null) return null;
  const raw = String(phone).trim();
  if (!raw) return null;

  const digits = raw.replace(/\D/g, '');

  // Mobile: 10 digits starting with 6-9.
  if (digits.length === 10 && /^[6-9]/.test(digits)) return formatIndianMobile(digits);

  // Mobile with country code: 91 + 10 digits.
  if (digits.length === 12 && digits.startsWith('91') && /^[6-9]/.test(digits[2])) {
    return formatIndianMobile(digits.slice(2));
  }

  // Anything else with 8-11 digits is kept verbatim (treated as a landline).
  if (digits.length >= 8 && digits.length <= 11) return raw;

  return null;
}

/**
 * Validates a scraped website URL.
 *
 * @param {*} website - Raw URL.
 * @returns {string|null} The trimmed URL, or `null` when it is missing, does not
 *   start with `http`, or contains one of the blocked fragments.
 */
function cleanWebsite(website) {
  if (typeof website !== 'string') return null;
  const url = website.trim();
  if (!url) return null;

  // Compare case-insensitively so e.g. "Facebook.com" is also rejected.
  const lowered = url.toLowerCase();
  if (BLOCKED_WEBSITE_FRAGMENTS.some((fragment) => lowered.includes(fragment))) return null;
  if (!lowered.startsWith('http')) return null;

  return url;
}

/**
 * Validates and normalizes a business name.
 *
 * @param {*} name - Raw name.
 * @returns {string|null} The trimmed name (title-cased when it was all upper-case
 *   and longer than 4 characters), or `null` for placeholders / too-short names.
 */
function cleanName(name) {
  if (typeof name !== 'string') return null;

  // Trim first so the length and placeholder checks see the real content.
  const trimmed = name.trim();
  if (trimmed.length < 2) return null;
  if (BAD_NAMES.has(trimmed.toLowerCase())) return null;

  // Title-case names that were written entirely in capitals.
  if (trimmed === trimmed.toUpperCase() && trimmed.length > 4) {
    return trimmed
      .split(' ')
      .map((word) => word.charAt(0) + word.slice(1).toLowerCase())
      .join(' ');
  }
  return trimmed;
}

/**
 * Validates a rating.
 *
 * @param {*} rating - Raw rating text.
 * @returns {string|null} The rating as a string when it parses to a number in
 *   [1, 5], otherwise `null`.
 */
function cleanRating(rating) {
  if (rating == null || rating === '') return null;
  const value = Number.parseFloat(rating);
  if (Number.isNaN(value) || value < 1 || value > 5) return null;
  return String(value);
}

/**
 * Validates a review count.
 *
 * @param {*} reviews - Raw review count text, possibly with thousands separators.
 * @returns {string|null} The non-negative integer as a string, otherwise `null`.
 */
function cleanReviews(reviews) {
  if (reviews == null) return null;
  const count = Number.parseInt(String(reviews).replace(/,/g, ''), 10);
  if (Number.isNaN(count) || count < 0) return null;
  return String(count);
}

/**
 * Scores how complete a lead is.
 *
 * @param {object} lead - Cleaned lead.
 * @returns {number} Sum of: phone 30, address 20, rating 15, reviews 10,
 *   name longer than 3 characters 25.
 */
function qualityScore(lead) {
  let score = 0;
  if (lead.phone) score += 30;
  if (lead.address) score += 20;
  if (lead.rating) score += 15;
  if (lead.reviews) score += 10;
  if (lead.name && lead.name.length > 3) score += 25;
  return score;
}

/**
 * Cleans every field of a raw scraped record and applies the quality gate.
 *
 * @param {object|null} raw - Record produced by `scrapeDetail`.
 * @param {string} category - Search category stored on the lead.
 * @param {string} city - Search city stored on the lead.
 * @returns {object|null} The cleaned lead, or `null` when the name is invalid or
 *   the quality score is below 20.
 */
function applyQuality(raw, category, city) {
  if (!raw) return null;

  const name = cleanName(raw.name);
  if (!name) return null; // Drop if no valid name

  const website = cleanWebsite(raw.website);
  const address = typeof raw.address === 'string' ? raw.address.trim() || null : null;

  const lead = {
    name,
    phone: cleanPhone(raw.phone),
    address,
    website,
    has_website: !!website,
    rating: cleanRating(raw.rating),
    reviews: cleanReviews(raw.reviews),
    maps_url: raw.maps_url || null,
    category,
    city,
    created_at: new Date().toISOString(),
  };

  // NOTE: a name longer than 3 characters alone scores 25, so in practice this
  // only drops 2-3 character names that have no phone and no address and at
  // most one of rating or reviews.
  if (qualityScore(lead) < 20) return null;
  return lead;
}

// ── JOB QUEUE ─────────────────────────────────────────────────────────────────

let activeWorkers = 0;
const queue = [];

/**
 * Result promises of jobs that are queued or running, keyed by (category, city).
 * Lets duplicate requests share one scrape instead of starting another.
 */
const pendingJobs = new Map();

/** Starts queued jobs until the queue is empty or MAX_WORKERS are busy. */
function tryProcessQueue() {
  while (queue.length > 0 && activeWorkers < MAX_WORKERS) {
    const job = queue.shift();
    activeWorkers++;
    console.log(`🟢 Worker [${activeWorkers}/${MAX_WORKERS}]`);
    runScraper(job)
      .then(job.resolve)
      .catch((e) => {
        console.error('❌ Worker error:', e.message, e.stack);
        job.reject(e);
      })
      .finally(() => {
        activeWorkers--;
        tryProcessQueue();
      });
  }
}

/**
 * Queues a scrape job, or joins an identical one that is already queued/running.
 *
 * @param {string} category
 * @param {string} city
 * @param {number} limit - Maximum number of leads to collect.
 * @returns {{promise: Promise<object[]>, isOwner: boolean}} `promise` settles with
 *   the job's leads; `isOwner` is true only for the caller that created the job.
 */
function enqueueJob(category, city, limit) {
  const key = JSON.stringify([category, city]);

  const existing = pendingJobs.get(key);
  if (existing) {
    console.log(`⏭️ Duplicate skipped: ${category} in ${city}`);
    return { promise: existing, isOwner: false };
  }

  const promise = new Promise((resolve, reject) => {
    queue.push({ category, city, limit, resolve, reject });
  });
  pendingJobs.set(key, promise);

  // Forget the job once it settles. Both handlers are supplied so this derived
  // promise can never become an unhandled rejection.
  const forget = () => pendingJobs.delete(key);
  promise.then(forget, forget);

  tryProcessQueue();
  return { promise, isOwner: true };
}

/**
 * Checks the request token against AUTH_TOKEN.
 * Fails closed when AUTH_TOKEN is not configured, and compares in constant time.
 *
 * @param {*} token - Token supplied by the client.
 * @returns {boolean}
 */
function isAuthorized(token) {
  const expected = process.env.AUTH_TOKEN;
  if (!expected || typeof token !== 'string') return false;
  const given = Buffer.from(token);
  const wanted = Buffer.from(expected);
  // timingSafeEqual throws on length mismatch, so check lengths first.
  return given.length === wanted.length && crypto.timingSafeEqual(given, wanted);
}

/**
 * Converts a client-supplied limit into an integer in [1, MAX_LIMIT].
 *
 * @param {*} limit
 * @returns {number} DEFAULT_LIMIT when the value is not a positive integer.
 */
function normalizeLimit(limit) {
  const parsed = Number.parseInt(limit, 10);
  if (!Number.isFinite(parsed) || parsed < 1) return DEFAULT_LIMIT;
  return Math.min(parsed, MAX_LIMIT);
}

/**
 * Replaces the stored leads for a category/city with a fresh batch.
 * Errors are logged, never thrown: the HTTP response has already been sent.
 *
 * @param {string} category
 * @param {string} city
 * @param {object[]} leads
 * @returns {Promise<void>}
 */
async function persistLeads(category, city, leads) {
  try {
    // NOTE(review): `city` is used as a substring pattern, so LIKE wildcards in
    // the input widen the delete — confirm this is the intended match.
    const { error: deleteError } = await supabase
      .from('leads')
      .delete()
      .eq('category', category)
      .ilike('city', `%${city}%`);
    if (deleteError) {
      // Skip the insert so a failed delete cannot leave duplicate rows behind.
      console.error('❌ Supabase delete failed:', deleteError.message);
      return;
    }

    const { error: insertError } = await supabase.from('leads').insert(leads);
    if (insertError) console.error('❌ Supabase insert failed:', insertError.message);
  } catch (e) {
    console.error('❌ Supabase persist error:', e.message);
  }
}

// ── HTTP API ──────────────────────────────────────────────────────────────────

app.get('/', (req, res) => res.json({
  status: '✅ Online',
  workers: `${activeWorkers}/${MAX_WORKERS}`,
  queue: queue.length,
}));

/**
 * POST /scrape — body: { category, city, token, limit? }.
 * Responds with { leads } once the (possibly shared) scrape job finishes.
 */
app.post('/scrape', async (req, res) => {
  const { category, city, token, limit = DEFAULT_LIMIT } = req.body ?? {};
  if (!isAuthorized(token)) return res.status(401).json({ error: 'Unauthorized' });

  const wantedCategory = typeof category === 'string' ? category.trim() : '';
  const wantedCity = typeof city === 'string' ? city.trim() : '';
  if (!wantedCategory || !wantedCity) {
    return res.status(400).json({ error: 'category and city are required' });
  }

  const { promise, isOwner } = enqueueJob(wantedCategory, wantedCity, normalizeLimit(limit));

  let leads;
  try {
    leads = await promise;
  } catch (e) {
    return res.status(500).json({ error: e.message });
  }

  res.json({ leads });

  // Only the request that created the job writes to the database, so a shared
  // job is persisted exactly once.
  if (isOwner && leads.length > 0) await persistLeads(wantedCategory, wantedCity, leads);
});

// ── SCRAPER ───────────────────────────────────────────────────────────────────

/** Resource types that are aborted to speed up page loads. */
const BLOCKED_RESOURCE_TYPES = ['image', 'font', 'stylesheet', 'media'];

/**
 * Runs one full scrape: searches Google Maps, collects place URLs, scrapes each
 * place page and returns the cleaned, de-duplicated leads.
 *
 * @param {{category: string, city: string, limit: number}} job
 * @returns {Promise<object[]>} Leads that passed `applyQuality`, unique by name.
 */
async function runScraper({ category, city, limit }) {
  console.log(`🚀 ${category} in ${city} | ${limit}`);

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
  });

  // Everything after launch is inside try/finally so the browser is always closed.
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
      viewport: { width: 1366, height: 768 },
      locale: 'en-IN',
      extraHTTPHeaders: { 'Accept-Language': 'en-IN,en;q=0.9' },
    });

    // Block images, fonts, stylesheets and media for every page in this context.
    await context.route('**/*', (route) => {
      const type = route.request().resourceType();
      const handled = BLOCKED_RESOURCE_TYPES.includes(type) ? route.abort() : route.continue();
      // The page may close while a request is in flight; ignore that rejection
      // instead of surfacing it as an unhandled promise rejection.
      handled.catch(() => {});
    });

    const listPage = await context.newPage();
    await listPage.goto(
      `https://www.google.com/maps/search/${encodeURIComponent(`${category} in ${city}`)}`,
      { waitUntil: 'domcontentloaded', timeout: 45000 },
    );
    // Best effort: if the feed never appears, scrollAndCollect simply finds nothing.
    await listPage.waitForSelector('[role="feed"]', { timeout: 12000 }).catch(() => {});

    const urls = await scrollAndCollect(listPage, limit);
    await listPage.close();
    console.log(`📋 ${urls.length} URLs`);
    if (urls.length === 0) return [];

    // Parallel scrape — 5 tabs at once
    const BATCH = 5;
    const results = [];
    for (let i = 0; i < urls.length; i += BATCH) {
      const batch = urls.slice(i, i + BATCH);
      const scraped = await Promise.all(batch.map((url) => scrapeDetail(context, url, category, city)));
      const cleaned = scraped
        .filter(Boolean)
        .map((record) => applyQuality(record, category, city))
        .filter(Boolean);
      results.push(...cleaned);
      console.log(` ✅ ${results.length}/${urls.length} (quality filtered)`);
    }

    // Final dedup by name (case-insensitive); the first occurrence wins.
    const seen = new Set();
    const deduped = results.filter((lead) => {
      const key = lead.name.toLowerCase().trim();
      if (seen.has(key)) return false;
      seen.add(key);
      return true;
    });

    console.log(`💎 Final: ${deduped.length} quality leads`);
    return deduped;
  } finally {
    await browser.close();
  }
}

/**
 * Scrolls the results feed and collects place URLs.
 * Stops after 15 scrolls, after 3 consecutive scrolls without new URLs, or once
 * `needed` URLs were found.
 *
 * @param {object} page - Playwright page showing the search results.
 * @param {number} needed - Maximum number of URLs to return.
 * @returns {Promise<string[]>} Up to `needed` unique place URLs.
 */
async function scrollAndCollect(page, needed) {
  // Also rejects NaN, which would otherwise never satisfy the size check below.
  if (!(needed > 0)) return [];

  const urls = new Set();
  let prev = 0;
  let noNew = 0;

  for (let i = 0; i < 15; i++) {
    const found = await page
      .$$eval('a[href*="/maps/place/"]', (els) =>
        [...new Set(els.map((e) => e.href).filter((h) => h.includes('/maps/place/')))])
      .catch(() => []);
    found.forEach((u) => urls.add(u));

    if (urls.size >= needed) break;

    if (urls.size === prev) {
      if (++noNew >= 3) break;
    } else {
      noNew = 0;
    }
    prev = urls.size;

    await page.evaluate(() => {
      const feed = document.querySelector('[role="feed"]');
      if (feed) feed.scrollTop += 2000;
    });
    await page.waitForTimeout(1000);
  }

  return [...urls].slice(0, needed);
}

/**
 * Opens one place page in a new tab and extracts its raw fields.
 * Never throws: any failure is logged and reported as `null`, so one bad tab
 * cannot reject the whole batch.
 *
 * @param {object} context - Playwright browser context to open the tab in.
 * @param {string} url - Place URL.
 * @param {string} category
 * @param {string} city
 * @returns {Promise<object|null>} Raw record, or `null` when the page failed or
 *   has no usable name.
 */
async function scrapeDetail(context, url, category, city) {
  let page;
  try {
    page = await context.newPage();
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 25000 });
    await page.waitForSelector('h1', { timeout: 8000 }).catch(() => {});
    await page.waitForTimeout(500);

    // Runs inside the browser; each field tries its selectors in order.
    const data = await page.evaluate(() => ({
      name: document.querySelector('h1.DUwDvf')?.innerText
        || document.querySelector('h1')?.innerText
        || null,
      rating: document.querySelector('span.ceNzR span')?.innerText
        || document.querySelector('span.MW4etd')?.innerText
        || null,
      reviews: document.querySelector('span.UY7F9')?.innerText?.replace(/[()]/g, '').trim() || null,
      address: document.querySelector('button[data-item-id="address"]')?.innerText
        || document.querySelector('[data-tooltip="Copy address"]')?.innerText
        || null,
      phone: (
        document.querySelector('button[data-item-id*="phone:tel"]')
        || document.querySelector('[data-tooltip="Copy phone number"]')
        || document.querySelector('a[href^="tel:"]')
      )?.innerText?.trim()
        || document.querySelector('a[href^="tel:"]')?.getAttribute('href')?.replace('tel:', '')
        || null,
      website: (
        document.querySelector('a[data-item-id="authority"]')
        || document.querySelector('a[aria-label*="website" i]')
      )?.href || null,
    }));

    if (!data.name || data.name.length < 2) return null;

    return {
      ...data,
      name: data.name.trim(),
      has_website: !!data.website,
      maps_url: url,
      category,
      city,
      created_at: new Date().toISOString(),
    };
  } catch (e) {
    console.log('⚠️ Detail fail:', e.message);
    return null;
  } finally {
    // Closing can fail if the browser is already gone; that must not mask the result.
    if (page) await page.close().catch(() => {});
  }
}