vibhansh's picture
Update main.js
add3557 verified
const { chromium } = require('playwright');
const { createClient } = require('@supabase/supabase-js');
const express = require('express');
// Express application exposing the scraper's HTTP API; request bodies are parsed as JSON.
const app = express();
app.use(express.json());
// Start listening on port 7860 and keep the returned server handle on `app.server`.
app.server = app.listen(7860, () => console.log('βœ… LandSea Engine on port 7860'));
// Server timeout set to 180000 ms (3 minutes) — presumably so long-running /scrape
// requests are not cut off; confirm against the hosting environment.
app.server.timeout = 180000;
// Supabase client built from the SUPABASE_URL / SUPABASE_KEY environment variables.
const supabase = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_KEY);
// Upper bound on concurrently running scraper jobs (checked in tryProcessQueue).
const MAX_WORKERS = 4;
// ── DATA QUALITY ──────────────────────────────────────────────────────────────
/**
 * Normalizes a scraped phone number.
 *
 * Indian mobile numbers (10 digits starting with 6-9) are returned as
 * `+91 XXXXX XXXXX`, whether they arrive bare, with a leading trunk `0`
 * (the form Google Maps usually shows, e.g. "098765 43210") or with the
 * `91` country code. Anything else with 8-11 digits is treated as a
 * landline and returned as the trimmed original text.
 *
 * @param {string|number|null|undefined} phone - Raw phone value.
 * @returns {string|null} Normalized number, or null when the value is not a plausible phone.
 */
function cleanPhone(phone) {
  if (phone == null || phone === '') return null;

  // Coerce so numeric input does not throw on `.replace`.
  const raw = String(phone);
  const digits = raw.replace(/\D/g, '');

  const isMobile = (ten) => /^[6-9]\d{9}$/.test(ten);
  const formatMobile = (ten) => `+91 ${ten.slice(0, 5)} ${ten.slice(5)}`;

  // Bare 10-digit mobile.
  if (isMobile(digits)) return formatMobile(digits);

  // Trunk-prefixed mobile: 0 + 10 digits.
  if (digits.length === 11 && digits.startsWith('0') && isMobile(digits.slice(1))) {
    return formatMobile(digits.slice(1));
  }

  // Country-code-prefixed mobile: 91 + 10 digits.
  if (digits.length === 12 && digits.startsWith('91') && isMobile(digits.slice(2))) {
    return formatMobile(digits.slice(2));
  }

  // Landline (with or without STD code): keep the original formatting.
  if (digits.length >= 8 && digits.length <= 11) return raw.trim();

  return null;
}
/**
 * Validates a scraped website URL and rejects links that are not a business's own site.
 *
 * The URL must parse and use http(s). Social-media, map, messaging and
 * directory hosts are rejected. Matching is done on the (case-insensitive)
 * hostname only, so a legitimate site is not dropped just because a blocked
 * domain appears in its path or query string.
 *
 * @param {string|null|undefined} website - Raw URL.
 * @returns {string|null} The trimmed URL, or null when it is missing, invalid or blocked.
 */
function cleanWebsite(website) {
  if (typeof website !== 'string') return null;
  const candidate = website.trim();
  if (candidate === '') return null;

  let parsed;
  try {
    parsed = new URL(candidate);
  } catch {
    return null; // Not an absolute URL.
  }
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return null;

  const host = parsed.hostname.toLowerCase();

  // Exact domain or any of its subdomains.
  const blockedDomains = [
    'facebook.com', 'instagram.com', 'twitter.com', 'youtube.com',
    'google.com', 'goo.gl', 'wa.me', 'linkedin.com',
    'justdial.com', 'indiamart.com', 'tradeindia.com',
  ];
  // Fragments that identify a service across TLDs (maps.google.co.in, api.whatsapp.com, ...).
  const blockedFragments = ['maps.google', 'whatsapp'];

  const isBlockedDomain = blockedDomains.some((d) => host === d || host.endsWith(`.${d}`));
  const hasBlockedFragment = blockedFragments.some((f) => host.includes(f));
  if (isBlockedDomain || hasBlockedFragment) return null;

  return candidate;
}
/**
 * Normalizes a business name.
 *
 * The value is trimmed first so every later check and every return path sees
 * the same text. Placeholder values ("unknown", "n/a", ...) and names shorter
 * than two characters are rejected. Names longer than four characters that
 * are entirely upper-case are converted to Title Case.
 *
 * @param {string|null|undefined} name - Raw name.
 * @returns {string|null} Cleaned name, or null when it is not usable.
 */
function cleanName(name) {
  if (typeof name !== 'string') return null;

  const trimmed = name.trim();
  if (trimmed.length < 2) return null;

  const placeholders = ['unknown', 'undefined', 'null', 'n/a', 'na', 'none', 'test'];
  if (placeholders.includes(trimmed.toLowerCase())) return null;

  // Short all-caps names (e.g. "KFC") are likely acronyms, so leave them alone.
  const isShouting = trimmed.length > 4 && trimmed === trimmed.toUpperCase();
  if (isShouting) {
    return trimmed
      .split(' ')
      .map((word) => word.charAt(0) + word.slice(1).toLowerCase())
      .join(' ');
  }
  return trimmed;
}
/**
 * Parses a rating and keeps it only when it lies within 1..5.
 *
 * @param {string|number|null|undefined} rating - Raw rating value.
 * @returns {string|null} The parsed rating as a string, or null when missing or out of range.
 */
function cleanRating(rating) {
  if (!rating) {
    return null;
  }
  const value = Number.parseFloat(rating);
  // Both comparisons are false for NaN, so unparsable input is rejected here too.
  const inRange = value >= 1 && value <= 5;
  return inRange ? String(value) : null;
}
/**
 * Parses a review count such as "1,234" into a plain non-negative integer string.
 *
 * Accepts numbers as well as strings (the value is coerced before the
 * thousands separators are stripped) and always parses in base 10.
 *
 * @param {string|number|null|undefined} reviews - Raw review count.
 * @returns {string|null} The count as a string, or null when missing or invalid.
 */
function cleanReviews(reviews) {
  if (reviews == null) return null;

  const count = Number.parseInt(String(reviews).replace(/,/g, ''), 10);
  if (Number.isNaN(count) || count < 0) return null;

  return String(count);
}
/**
 * Scores a lead from 0 to 100 based on which fields are present.
 *
 * Weights: phone 30, name (longer than 3 characters) 25, address 20,
 * rating 15, reviews 10.
 *
 * @param {object} lead - Lead with optional phone, address, rating, reviews and name.
 * @returns {number} Sum of the weights of the fields that are present.
 */
function qualityScore(lead) {
  const hasUsableName = Boolean(lead.name) && lead.name.length > 3;
  const signals = [
    [lead.phone, 30],
    [lead.address, 20],
    [lead.rating, 15],
    [lead.reviews, 10],
    [hasUsableName, 25],
  ];
  return signals.reduce(
    (total, [present, points]) => (present ? total + points : total),
    0,
  );
}
/**
 * Cleans a raw scraped record into a lead row, or drops it when it is not usable.
 *
 * A record is dropped when it has no valid name, when it has none of phone,
 * address or rating, or when its quality score is below 20.
 *
 * @param {object|null} raw - Raw record with name, phone, website, rating, reviews, address, maps_url.
 * @param {string} category - Search category stored on the lead.
 * @param {string} city - Search city stored on the lead.
 * @returns {object|null} The cleaned lead, or null when the record is rejected.
 */
function applyQuality(raw, category, city) {
  if (!raw) return null;

  const name = cleanName(raw.name);
  if (!name) return null; // Drop if no valid name

  const website = cleanWebsite(raw.website);
  const lead = {
    name,
    phone: cleanPhone(raw.phone),
    address: raw.address?.trim() || null,
    website,
    has_website: Boolean(website),
    rating: cleanRating(raw.rating),
    reviews: cleanReviews(raw.reviews),
    maps_url: raw.maps_url || null,
    category,
    city,
    created_at: new Date().toISOString(),
  };

  // A name longer than 3 characters alone scores 25, which already clears the
  // threshold below. Check the "no phone + no address + no rating" rule
  // explicitly so name-only records are actually dropped.
  const hasContactSignal = Boolean(lead.phone || lead.address || lead.rating);
  if (!hasContactSignal || qualityScore(lead) < 20) return null;

  return lead;
}
// ─────────────────────────────────────────────────────────────────────────────
// Number of scraper jobs currently running; incremented and decremented in tryProcessQueue.
let activeWorkers = 0;
// Pending jobs waiting for a free worker. Entries are pushed by the /scrape handler
// ({ category, city, limit, resolve, reject }) and taken from the front by tryProcessQueue.
const queue = [];
/**
 * Starts queued jobs until the queue is empty or MAX_WORKERS jobs are running.
 *
 * Each started job is settled through its own resolve/reject callbacks; when
 * it finishes, its worker slot is released and the queue is drained again.
 */
function tryProcessQueue() {
  while (queue.length > 0 && activeWorkers < MAX_WORKERS) {
    const job = queue.shift();
    activeWorkers += 1;
    console.log(`🟒 Worker [${activeWorkers}/${MAX_WORKERS}]`);

    const onSuccess = job.resolve;
    const onFailure = (err) => {
      console.error('❌ Worker error:', err.message, err.stack);
      job.reject(err);
    };
    const releaseSlot = () => {
      activeWorkers -= 1;
      tryProcessQueue();
    };

    runScraper(job).then(onSuccess).catch(onFailure).finally(releaseSlot);
  }
}
// Status endpoint: reports that the service is up, how many workers are busy,
// and how many jobs are waiting in the queue.
app.get('/', (req, res) => {
  const snapshot = {
    status: 'βœ… Online',
    workers: `${activeWorkers}/${MAX_WORKERS}`,
    queue: queue.length,
  };
  res.json(snapshot);
});
/**
 * POST /scrape — queues a scrape for `{ category, city }` and responds with the leads.
 *
 * Body: `{ category, city, token, limit = 10 }`. `limit` is clamped to 1..40.
 * Responses: 401 on a bad token, 400 on a missing category/city, 500 when the
 * scrape fails, otherwise `{ leads: [...] }`.
 *
 * A request for a category/city that is already waiting in the queue shares
 * the queued job's promise instead of starting a second scrape. After
 * responding, the request that created the job replaces the stored leads for
 * that category/city in Supabase.
 */
app.post('/scrape', async (req, res) => {
  // `req.body` can be undefined when no JSON body was sent.
  const { category, city, token, limit = 10 } = req.body ?? {};

  // Fail closed: with AUTH_TOKEN unset, `undefined !== undefined` would
  // otherwise let a request without a token through.
  const expectedToken = process.env.AUTH_TOKEN;
  if (!expectedToken || token !== expectedToken) {
    return res.status(401).json({ error: 'Unauthorized' });
  }

  const isBlank = (value) => typeof value !== 'string' || value.trim() === '';
  if (isBlank(category) || isBlank(city)) {
    return res.status(400).json({ error: 'category and city are required' });
  }

  // A NaN or negative limit would otherwise flow into the scraper and break
  // its `slice(0, limit)` / size comparisons.
  const parsedLimit = Number.parseInt(limit, 10);
  const safeLimit = Number.isNaN(parsedLimit) ? 10 : Math.min(Math.max(parsedLimit, 1), 40);

  // Duplicate check: join a job that is still waiting in the queue. Every
  // waiter awaits the same promise, so nobody's callbacks are overwritten and
  // every request gets a response.
  let job = queue.find((j) => j.category === category && j.city === city);
  const isDuplicate = Boolean(job);
  if (isDuplicate) {
    console.log(`⏭️ Duplicate skipped: ${category} in ${city}`);
  } else {
    job = { category, city, limit: safeLimit };
    job.promise = new Promise((resolve, reject) => {
      job.resolve = resolve;
      job.reject = reject;
    });
    queue.push(job);
    tryProcessQueue();
  }

  let leads;
  try {
    leads = await job.promise;
  } catch (e) {
    return res.status(500).json({ error: e.message });
  }
  res.json({ leads });

  // Only the request that created the job persists, so results are written once.
  if (isDuplicate || leads.length === 0) return;

  // The response is already sent; failures here can only be logged.
  try {
    const removed = await supabase
      .from('leads')
      .delete()
      .eq('category', category)
      .ilike('city', `%${city}%`);
    // supabase-js reports failures in `error` rather than throwing.
    if (removed.error) console.error('Supabase delete failed:', removed.error.message);

    const inserted = await supabase.from('leads').insert(leads);
    if (inserted.error) console.error('Supabase insert failed:', inserted.error.message);
  } catch (e) {
    console.error('Supabase persistence failed:', e.message);
  }
});
/**
 * Runs one scrape job: searches Google Maps for `category in city`, collects
 * up to `limit` place URLs, scrapes each place page, applies the quality
 * filters and returns the leads de-duplicated by name.
 *
 * The browser is closed on every exit path, including failures while the
 * context or the request routing is being set up.
 *
 * @param {{category: string, city: string, limit: number}} job - Job description.
 * @returns {Promise<object[]>} Cleaned, de-duplicated leads (possibly empty).
 */
async function runScraper({ category, city, limit }) {
  console.log(`πŸš€ ${category} in ${city} | ${limit}`);
  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
  });

  // Everything after launch sits inside try/finally so the browser cannot leak.
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
      viewport: { width: 1366, height: 768 },
      locale: 'en-IN',
      extraHTTPHeaders: { 'Accept-Language': 'en-IN,en;q=0.9' }
    });

    // Block images, fonts, stylesheets and media for faster loading.
    const blockedTypes = ['image', 'font', 'stylesheet', 'media'];
    await context.route('**/*', (route) => {
      const type = route.request().resourceType();
      const handled = blockedTypes.includes(type) ? route.abort() : route.continue();
      // abort()/continue() reject if the page closes while the request is in
      // flight. Swallow that here so it cannot surface as an unhandled rejection.
      return handled.catch(() => {});
    });

    const listPage = await context.newPage();
    await listPage.goto(`https://www.google.com/maps/search/${encodeURIComponent(`${category} in ${city}`)}`, {
      waitUntil: 'domcontentloaded', timeout: 45000
    });
    // Best effort: the results feed may be absent (e.g. a single direct hit).
    await listPage.waitForSelector('[role="feed"]', { timeout: 12000 }).catch(() => {});
    const urls = await scrollAndCollect(listPage, limit);
    await listPage.close();
    console.log(`πŸ“‹ ${urls.length} URLs`);
    if (urls.length === 0) return [];

    // Parallel scrape: 5 tabs at once.
    const BATCH = 5;
    const results = [];
    for (let i = 0; i < urls.length; i += BATCH) {
      const batch = urls.slice(i, i + BATCH);
      const res = await Promise.all(batch.map(url => scrapeDetail(context, url, category, city)));
      const cleaned = res.filter(Boolean).map(r => applyQuality(r, category, city)).filter(Boolean);
      results.push(...cleaned);
      console.log(` βœ… ${results.length}/${urls.length} (quality filtered)`);
    }

    // Final dedup by (case-insensitive) name.
    const seen = new Set();
    const deduped = results.filter(r => {
      const key = r.name.toLowerCase().trim();
      if (seen.has(key)) return false;
      seen.add(key);
      return true;
    });
    console.log(`πŸ’Ž Final: ${deduped.length} quality leads`);
    return deduped;
  } finally {
    await browser.close();
  }
}
/**
 * Scrolls the results feed and gathers unique place URLs.
 *
 * Stops after 15 rounds, once `needed` URLs are collected, or after three
 * rounds in a row that add nothing new.
 *
 * @param {object} page - Page object providing `$$eval`, `evaluate` and `waitForTimeout`.
 * @param {number} needed - Maximum number of URLs to return.
 * @returns {Promise<string[]>} Up to `needed` unique URLs, in discovery order.
 */
async function scrollAndCollect(page, needed) {
  const MAX_ROUNDS = 15;
  const MAX_STALLED_ROUNDS = 3;

  const collected = new Set();
  let lastCount = 0;
  let stalledRounds = 0;

  for (let round = 0; round < MAX_ROUNDS; round += 1) {
    // A failed query counts as "nothing found" for this round.
    const links = await page
      .$$eval('a[href*="/maps/place/"]', (anchors) => {
        const hrefs = anchors
          .map((anchor) => anchor.href)
          .filter((href) => href.includes('/maps/place/'));
        return Array.from(new Set(hrefs));
      })
      .catch(() => []);

    for (const link of links) {
      collected.add(link);
    }

    if (collected.size >= needed) {
      break;
    }

    if (collected.size !== lastCount) {
      stalledRounds = 0;
    } else {
      stalledRounds += 1;
      if (stalledRounds >= MAX_STALLED_ROUNDS) {
        break;
      }
    }
    lastCount = collected.size;

    // Scroll the feed to make more results load, then give them time to render.
    await page.evaluate(() => {
      const feed = document.querySelector('[role="feed"]');
      if (feed) feed.scrollTop += 2000;
    });
    await page.waitForTimeout(1000);
  }

  return Array.from(collected).slice(0, needed);
}
/**
 * Opens one place page in a new tab and extracts its raw details.
 *
 * This is best effort: any failure (opening the tab, navigation, extraction)
 * is logged and yields null so one bad place cannot abort the whole batch.
 * The tab is always closed, and a failing close never replaces the result.
 *
 * @param {object} context - Browser context providing `newPage()`.
 * @param {string} url - Place URL to open.
 * @param {string} category - Search category copied onto the record.
 * @param {string} city - Search city copied onto the record.
 * @returns {Promise<object|null>} Raw record, or null on failure or when no usable name is found.
 */
async function scrapeDetail(context, url, category, city) {
  let page = null;
  try {
    // Inside the try so a tab that cannot be opened is handled like any other failure.
    page = await context.newPage();
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 25000 });
    await page.waitForSelector('h1', { timeout: 8000 }).catch(() => {});
    await page.waitForTimeout(500);

    // Runs in the browser: each field tries its selectors in order and falls back to null.
    const data = await page.evaluate(() => ({
      name:
        document.querySelector('h1.DUwDvf')?.innerText ||
        document.querySelector('h1')?.innerText || null,
      rating:
        document.querySelector('span.ceNzR span')?.innerText ||
        document.querySelector('span.MW4etd')?.innerText || null,
      reviews:
        document.querySelector('span.UY7F9')?.innerText?.replace(/[()]/g, '').trim() || null,
      address:
        document.querySelector('button[data-item-id="address"]')?.innerText ||
        document.querySelector('[data-tooltip="Copy address"]')?.innerText || null,
      phone: (
        document.querySelector('button[data-item-id*="phone:tel"]') ||
        document.querySelector('[data-tooltip="Copy phone number"]') ||
        document.querySelector('a[href^="tel:"]')
      )?.innerText?.trim() ||
        document.querySelector('a[href^="tel:"]')?.getAttribute('href')?.replace('tel:', '') || null,
      website: (
        document.querySelector('a[data-item-id="authority"]') ||
        document.querySelector('a[aria-label*="website" i]')
      )?.href || null,
    }));

    // Trim before the length check so whitespace-only names are rejected.
    const name = data.name?.trim();
    if (!name || name.length < 2) return null;

    return {
      ...data,
      name,
      has_website: !!data.website,
      maps_url: url,
      category, city,
      created_at: new Date().toISOString()
    };
  } catch (e) {
    console.log('⚠️ Detail fail:', e.message);
    return null;
  } finally {
    // A throw from close() here would override the value returned above.
    if (page) await page.close().catch(() => {});
  }
}