// Spaces: Running (hosting-page status text captured together with this file; not part of the program)
const { timingSafeEqual } = require('crypto');

const { createClient } = require('@supabase/supabase-js');
const express = require('express');
const { chromium } = require('playwright');
// Express application; JSON request bodies are parsed for every route.
const app = express();
app.use(express.json());

// Start listening immediately. The server handle is kept on `app.server`
// so its socket timeout can be raised to 180 000 ms right after startup.
app.server = app.listen(7860, function onListening() {
  console.log('β LandSea Engine on port 7860');
});
app.server.timeout = 180000;

// Supabase client; URL and key are read from the environment.
const supabase = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_KEY);

// Upper bound on scraper jobs that may run at the same time.
const MAX_WORKERS = 4;
// ── DATA QUALITY ──────────────────────────────────────────────────────────────
/**
 * Normalise a scraped phone number.
 *
 * - A 10-digit number starting with 6-9 (Indian mobile), with or without a
 *   leading `91` country code, is returned as `+91 XXXXX XXXXX`.
 * - Any other value containing 8 to 11 digits is treated as a landline and
 *   returned as the trimmed input text (its formatting is left untouched).
 * - Everything else is rejected.
 *
 * @param {string|number|null|undefined} phone Raw phone value.
 * @returns {string|null} Normalised phone text, or `null` when invalid.
 */
function cleanPhone(phone) {
  if (!phone) return null;

  // Coerce first so a numeric phone (e.g. 9876543210) does not throw on
  // `.replace`.
  const text = String(phone);
  const digits = text.replace(/\D/g, '');
  const formatMobile = (ten) => `+91 ${ten.slice(0, 5)} ${ten.slice(5)}`;

  // Indian mobile: 10 digits starting with 6-9.
  if (digits.length === 10 && /^[6-9]/.test(digits)) {
    return formatMobile(digits);
  }

  // Same, prefixed with the 91 country code.
  if (digits.length === 12 && digits.startsWith('91') && /^[6-9]/.test(digits[2])) {
    return formatMobile(digits.slice(2));
  }

  // Landline, with or without STD code: keep the text as scraped.
  if (digits.length >= 8 && digits.length <= 11) {
    return text.trim();
  }

  return null;
}
/**
 * Domains that are social-media, messaging, map or directory pages rather
 * than a business's own website. A host matches when it equals one of these
 * or is a subdomain of it.
 */
const BLOCKED_WEBSITE_DOMAINS = [
  'facebook.com', 'instagram.com', 'twitter.com', 'youtube.com',
  'google.com', 'goo.gl', 'wa.me', 'whatsapp.com', 'linkedin.com',
  'justdial.com', 'indiamart.com', 'tradeindia.com',
];

/**
 * Validate a scraped website link.
 *
 * The value must parse as an absolute http(s) URL whose host is not on the
 * block list. Matching is done on the hostname only, so a block-listed name
 * that merely appears in the path or query string (e.g. a
 * `utm_source=facebook.com` parameter) or inside another domain name
 * (e.g. `notgoogle.com`) no longer discards a genuine website.
 *
 * @param {string|null|undefined} website Raw link.
 * @returns {string|null} The trimmed link, or `null` when it is unusable.
 */
function cleanWebsite(website) {
  if (typeof website !== 'string') return null;
  const candidate = website.trim();
  if (candidate === '') return null;

  let parsed;
  try {
    parsed = new URL(candidate);
  } catch {
    return null; // not an absolute URL
  }
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return null;

  const host = parsed.hostname.toLowerCase();
  const isBlocked =
    host.startsWith('maps.google.') || // Maps on any Google country domain
    BLOCKED_WEBSITE_DOMAINS.some((domain) => host === domain || host.endsWith(`.${domain}`));

  return isBlocked ? null : candidate;
}
/**
 * Normalise a scraped business name.
 *
 * The name is trimmed first, so every later check and every return value
 * works on the same trimmed text. Placeholder values and names shorter than
 * two characters are rejected. A name longer than four characters that is
 * written entirely in upper case is converted so that only the first
 * character of each space-separated word keeps its case.
 *
 * @param {string|null|undefined} name Raw name.
 * @returns {string|null} Cleaned name, or `null` when unusable.
 */
function cleanName(name) {
  if (typeof name !== 'string') return null;

  const trimmed = name.trim();
  if (trimmed.length < 2) return null;

  const placeholders = ['unknown', 'undefined', 'null', 'n/a', 'na', 'none', 'test'];
  if (placeholders.includes(trimmed.toLowerCase())) return null;

  // Short all-caps names (e.g. "KFC") are left alone: they are likely acronyms.
  if (trimmed.length > 4 && trimmed === trimmed.toUpperCase()) {
    return trimmed
      .split(' ')
      .map((word) => word.charAt(0) + word.slice(1).toLowerCase())
      .join(' ');
  }

  return trimmed;
}
/**
 * Validate a scraped star rating.
 *
 * @param {string|number|null|undefined} rating Raw rating value.
 * @returns {string|null} The rating as a string when it parses to a number
 *   between 1 and 5 inclusive, otherwise `null`.
 */
function cleanRating(rating) {
  if (!rating) {
    return null;
  }

  const value = Number.parseFloat(rating);
  // Both comparisons are false for NaN, so an unparsable rating is rejected
  // here as well.
  const inRange = value >= 1 && value <= 5;
  return inRange ? String(value) : null;
}
/**
 * Normalise a scraped review count.
 *
 * Thousands separators are removed and the leading integer is parsed in
 * base 10 (so text such as `0x1A` is not read as hexadecimal). Numeric input
 * is accepted as well as strings.
 *
 * @param {string|number|null|undefined} reviews Raw review count.
 * @returns {string|null} The non-negative count as a string, or `null`.
 */
function cleanReviews(reviews) {
  if (reviews === null || reviews === undefined || reviews === '') return null;

  const count = Number.parseInt(String(reviews).replace(/,/g, ''), 10);
  if (Number.isNaN(count) || count < 0) return null;

  return String(count);
}
/**
 * Score a lead from 0 to 100 according to which fields are present.
 *
 * Weights: phone 30, name longer than three characters 25, address 20,
 * rating 15, reviews 10.
 *
 * @param {object} lead Lead with optional `phone`, `address`, `rating`,
 *   `reviews` and `name` properties.
 * @returns {number} Sum of the weights of the fields that are present.
 */
function qualityScore(lead) {
  const hasUsableName = Boolean(lead.name && lead.name.length > 3);
  const weighted = [
    [lead.phone, 30],
    [lead.address, 20],
    [lead.rating, 15],
    [lead.reviews, 10],
    [hasUsableName, 25],
  ];
  return weighted.reduce(
    (total, [present, points]) => (present ? total + points : total),
    0,
  );
}
/**
 * Turn a raw scraped record into a cleaned lead, or reject it.
 *
 * @param {object|null|undefined} raw Scraped record; `name`, `phone`,
 *   `address`, `website`, `rating`, `reviews` and `maps_url` are read.
 * @param {*} category Copied onto the lead unchanged.
 * @param {*} city Copied onto the lead unchanged.
 * @returns {object|null} The cleaned lead, or `null` when `raw` is missing,
 *   the name does not survive cleaning, or the quality score is below 20.
 */
function applyQuality(raw, category, city) {
  if (!raw) return null;

  // A lead without a valid name is useless, so stop before any other work.
  const name = cleanName(raw.name);
  if (!name) return null;

  const website = cleanWebsite(raw.website);
  const address = raw.address?.trim() || null;

  const lead = {
    name,
    phone: cleanPhone(raw.phone),
    address,
    website,
    has_website: Boolean(website),
    rating: cleanRating(raw.rating),
    reviews: cleanReviews(raw.reviews),
    maps_url: raw.maps_url || null,
    category,
    city,
    created_at: new Date().toISOString(),
  };

  // Minimum score of 20.
  // NOTE(review): qualityScore awards 25 for a name longer than three
  // characters, so that alone passes this gate; only leads with a very short
  // name and little other data are dropped. If the intent was to drop leads
  // with no phone, no address and no rating, the threshold needs revisiting.
  return qualityScore(lead) >= 20 ? lead : null;
}
// ─────────────────────────────────────────────────────────────────────────────
// Number of scraper jobs currently running.
let activeWorkers = 0;
// Jobs waiting for a free worker slot, oldest first.
const queue = [];

/**
 * Start queued jobs until the queue is empty or MAX_WORKERS jobs are running.
 *
 * Each job is handed to `runScraper`; its result goes to `job.resolve` and a
 * failure is logged and passed to `job.reject`. When a job settles, its slot
 * is released and the queue is examined again.
 */
function tryProcessQueue() {
  while (queue.length > 0 && activeWorkers < MAX_WORKERS) {
    const job = queue.shift();
    activeWorkers += 1;
    console.log(`π’ Worker [${activeWorkers}/${MAX_WORKERS}]`);

    const reportFailure = (e) => {
      console.error('β Worker error:', e.message, e.stack);
      job.reject(e);
    };
    const releaseSlot = () => {
      activeWorkers -= 1;
      tryProcessQueue();
    };

    runScraper(job)
      .then(job.resolve)
      .catch(reportFailure)
      .finally(releaseSlot);
  }
}
// Health/status endpoint: reports running workers and queued jobs.
app.get('/', (req, res) => {
  const snapshot = {
    status: 'β Online',
    workers: `${activeWorkers}/${MAX_WORKERS}`,
    queue: queue.length,
  };
  return res.json(snapshot);
});
/** Largest number of leads a single request may ask for. */
const MAX_LEADS_PER_REQUEST = 40;
/** Lead count used when `limit` is omitted or unusable. */
const DEFAULT_LEADS_PER_REQUEST = 10;

/**
 * Scrapes that are queued or running, keyed by category + city. A second
 * request for the same pair awaits the existing promise instead of replacing
 * the first caller's resolve/reject callbacks (which left that caller
 * without a response).
 */
const pendingScrapes = new Map();

/**
 * Check the request token against AUTH_TOKEN.
 * Fails closed when AUTH_TOKEN is not configured, so a request without a
 * token can never match an unset variable.
 */
function isAuthorized(token) {
  const expected = process.env.AUTH_TOKEN;
  if (typeof expected !== 'string' || expected === '' || typeof token !== 'string') {
    return false;
  }
  const given = Buffer.from(token);
  const wanted = Buffer.from(expected);
  // timingSafeEqual throws on unequal lengths, so compare lengths first.
  return given.length === wanted.length && timingSafeEqual(given, wanted);
}

/** Parse `limit` as a base-10 integer clamped to 1..MAX_LEADS_PER_REQUEST. */
function normalizeLimit(limit) {
  const parsed = Number.parseInt(limit, 10);
  if (Number.isNaN(parsed) || parsed < 1) return DEFAULT_LEADS_PER_REQUEST;
  return Math.min(parsed, MAX_LEADS_PER_REQUEST);
}

/**
 * Replace the stored leads for a category/city with a fresh result set.
 * Never throws: failures are logged, because the HTTP response has already
 * been sent by the time this runs.
 */
async function persistLeads(category, city, leads) {
  try {
    // NOTE(review): `city` is interpolated into an ILIKE pattern, so `%` or
    // `_` in the request widen this delete. Consider escaping them or using
    // an exact match.
    const removed = await supabase
      .from('leads')
      .delete()
      .eq('category', category)
      .ilike('city', `%${city}%`);
    if (removed?.error) console.error('Lead cleanup failed:', removed.error.message);

    const inserted = await supabase.from('leads').insert(leads);
    if (inserted?.error) console.error('Lead insert failed:', inserted.error.message);
  } catch (e) {
    console.error('Lead persistence failed:', e?.message ?? e);
  }
}

/**
 * POST /scrape — body: `{ category, city, token, limit? }`.
 *
 * Responds 401 for a bad token, 400 when category or city is missing,
 * 500 `{ error }` when the scrape fails, otherwise `{ leads }`. After a
 * successful response, the request that started the scrape stores the
 * leads in Supabase.
 */
app.post('/scrape', async (req, res) => {
  const { category, city, token, limit = 10 } = req.body ?? {};

  if (!isAuthorized(token)) return res.status(401).json({ error: 'Unauthorized' });

  const isNonEmptyText = (value) => typeof value === 'string' && value.trim() !== '';
  if (!isNonEmptyText(category) || !isNonEmptyText(city)) {
    return res.status(400).json({ error: 'category and city are required' });
  }

  const key = JSON.stringify([category, city]);
  let pending = pendingScrapes.get(key);
  const startedHere = pending === undefined;

  if (startedHere) {
    pending = new Promise((resolve, reject) => {
      queue.push({ category, city, limit: normalizeLimit(limit), resolve, reject });
      tryProcessQueue();
    });
    pendingScrapes.set(key, pending);
    // Two-argument then (not finally) so a failed scrape does not surface
    // here as a second, unhandled rejection.
    const forget = () => pendingScrapes.delete(key);
    pending.then(forget, forget);
  } else {
    console.log(`βοΈ Duplicate skipped: ${category} in ${city}`);
  }

  let leads;
  try {
    leads = await pending;
  } catch (e) {
    return res.status(500).json({ error: e?.message ?? String(e) });
  }

  res.json({ leads });

  // Only the request that started the scrape writes to the database, so
  // joined duplicates do not repeat the delete + insert.
  if (startedHere && Array.isArray(leads) && leads.length > 0) {
    await persistLeads(category, city, leads);
  }
});
/**
 * Run one scrape job: search Google Maps for "<category> in <city>", collect
 * up to `limit` place URLs, scrape each place page, clean the records and
 * drop repeated names.
 *
 * The browser is always closed, including when creating the context or
 * installing the request filter fails.
 *
 * @param {{category: string, city: string, limit: number}} job
 * @returns {Promise<object[]>} Cleaned leads, unique by lower-cased name.
 */
async function runScraper({ category, city, limit }) {
  console.log(`π ${category} in ${city} | ${limit}`);
  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
  });

  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
      viewport: { width: 1366, height: 768 },
      locale: 'en-IN',
      extraHTTPHeaders: { 'Accept-Language': 'en-IN,en;q=0.9' },
    });

    // Skip images, fonts, stylesheets and media in every page of the context
    // for faster loading.
    const skippedTypes = ['image', 'font', 'stylesheet', 'media'];
    await context.route('**/*', async (route) => {
      try {
        if (skippedTypes.includes(route.request().resourceType())) {
          await route.abort();
        } else {
          await route.continue();
        }
      } catch {
        // The page or context was closed while the request was in flight;
        // there is nothing left to route.
      }
    });

    const listPage = await context.newPage();
    await listPage.goto(`https://www.google.com/maps/search/${encodeURIComponent(`${category} in ${city}`)}`, {
      waitUntil: 'domcontentloaded', timeout: 45000,
    });
    // Best effort: continue even if the results feed never shows up.
    await listPage.waitForSelector('[role="feed"]', { timeout: 12000 }).catch(() => {});
    const urls = await scrollAndCollect(listPage, limit);
    await listPage.close();
    console.log(`π ${urls.length} URLs`);
    if (urls.length === 0) return [];

    // Scrape detail pages five tabs at a time.
    const BATCH = 5;
    const results = [];
    for (let i = 0; i < urls.length; i += BATCH) {
      const batch = urls.slice(i, i + BATCH);
      const res = await Promise.all(batch.map((url) => scrapeDetail(context, url, category, city)));
      const cleaned = res.filter(Boolean).map((r) => applyQuality(r, category, city)).filter(Boolean);
      results.push(...cleaned);
      console.log(` β ${results.length}/${urls.length} (quality filtered)`);
    }

    // Final dedup by name: the first lead with a given name wins.
    const seen = new Set();
    const deduped = results.filter((r) => {
      const key = r.name.toLowerCase().trim();
      if (seen.has(key)) return false;
      seen.add(key);
      return true;
    });
    console.log(`π Final: ${deduped.length} quality leads`);
    return deduped;
  } finally {
    await browser.close();
  }
}
/**
 * Collect place links from a results page, scrolling the results feed to
 * load more until enough have been found.
 *
 * Stops after 15 rounds, or after three rounds in a row that add nothing.
 *
 * @param {object} page Page object; `$$eval`, `evaluate` and
 *   `waitForTimeout` are called on it.
 * @param {number} needed Maximum number of URLs wanted. A value that is not
 *   a number of at least 1 yields an empty list without touching the page.
 * @returns {Promise<string[]>} Up to `needed` distinct URLs, in the order
 *   they were first seen.
 */
async function scrollAndCollect(page, needed) {
  // Without this guard a NaN target never satisfies the "enough" check, and a
  // negative one makes the final slice return almost everything collected.
  const target = Math.floor(Number(needed));
  if (!(target >= 1)) return [];

  const urls = new Set();
  let prev = 0;
  let noNew = 0;

  for (let i = 0; i < 15; i++) {
    // Best effort: a failed query counts as "nothing found this round".
    const found = await page.$$eval('a[href*="/maps/place/"]',
      (els) => [...new Set(els.map((e) => e.href).filter((h) => h.includes('/maps/place/')))]
    ).catch(() => []);
    found.forEach((u) => urls.add(u));

    if (urls.size >= target) break;
    if (urls.size === prev) {
      noNew += 1;
      if (noNew >= 3) break;
    } else {
      noNew = 0;
    }
    prev = urls.size;

    await page.evaluate(() => {
      const f = document.querySelector('[role="feed"]');
      if (f) f.scrollTop += 2000;
    });
    await page.waitForTimeout(1000);
  }

  return [...urls].slice(0, target);
}
/**
 * Open one place page in a new tab and read its details.
 *
 * Never rejects: a tab that cannot be opened, a page that fails to load and
 * a tab that fails to close are all contained here, so one bad URL cannot
 * fail the whole batch it belongs to.
 *
 * @param {object} context Browser context; `newPage()` is called on it.
 * @param {string} url Place URL to open; stored as `maps_url`.
 * @param {*} category Copied onto the result.
 * @param {*} city Copied onto the result.
 * @returns {Promise<object|null>} Raw record (name, rating, reviews, address,
 *   phone, website, has_website, maps_url, category, city, created_at), or
 *   `null` when the page failed or has no name of at least two characters.
 */
async function scrapeDetail(context, url, category, city) {
  let page;
  try {
    page = await context.newPage();
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 25000 });
    // Best effort: read whatever is there even if no heading appears in time.
    await page.waitForSelector('h1', { timeout: 8000 }).catch(() => {});
    await page.waitForTimeout(500);

    const data = await page.evaluate(() => ({
      name:
        document.querySelector('h1.DUwDvf')?.innerText ||
        document.querySelector('h1')?.innerText || null,
      rating:
        document.querySelector('span.ceNzR span')?.innerText ||
        document.querySelector('span.MW4etd')?.innerText || null,
      reviews:
        document.querySelector('span.UY7F9')?.innerText?.replace(/[()]/g, '').trim() || null,
      address:
        document.querySelector('button[data-item-id="address"]')?.innerText ||
        document.querySelector('[data-tooltip="Copy address"]')?.innerText || null,
      phone: (
        document.querySelector('button[data-item-id*="phone:tel"]') ||
        document.querySelector('[data-tooltip="Copy phone number"]') ||
        document.querySelector('a[href^="tel:"]')
      )?.innerText?.trim() ||
        document.querySelector('a[href^="tel:"]')?.getAttribute('href')?.replace('tel:', '') || null,
      website: (
        document.querySelector('a[data-item-id="authority"]') ||
        document.querySelector('a[aria-label*="website" i]')
      )?.href || null,
    }));

    if (!data.name || data.name.length < 2) return null;

    return {
      ...data,
      name: data.name.trim(),
      has_website: !!data.website,
      maps_url: url,
      category, city,
      created_at: new Date().toISOString(),
    };
  } catch (e) {
    console.log('β οΈ Detail fail:', e.message);
    return null;
  } finally {
    if (page) {
      try {
        await page.close();
      } catch {
        // A close failure must not replace the scraped result with an error.
      }
    }
  }
}