const puppeteer = require('puppeteer');
const { getDb } = require('../db/database');
const crypto = require('crypto');
/**
 * Suspends the awaiting caller for roughly `ms` milliseconds.
 *
 * @param {number} ms - How long to wait, in milliseconds (handed to setTimeout).
 * @returns {Promise<undefined>} Settles with no value once the timer fires.
 */
async function delay(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Tells whether a latitude/longitude value can be placed in a Maps URL.
 * Accepts finite numbers (including 0) and non-blank numeric strings.
 *
 * @param {*} value - Candidate coordinate.
 * @returns {boolean} True when the value is a usable coordinate.
 */
function isUsableCoordinate(value) {
  if (typeof value === 'number') {
    return Number.isFinite(value);
  }
  return typeof value === 'string' && value.trim() !== '' && Number.isFinite(Number(value));
}

/**
 * Opens a new tab with the viewport and user agent used by every page in this scraper.
 *
 * @param {object} browser - Browser handle returned by `puppeteer.launch`.
 * @returns {Promise<object>} The configured page.
 */
async function openScraperPage(browser) {
  const page = await browser.newPage();
  await page.setViewport({ width: 1920, height: 1080 });
  // Set a normal user agent to bypass basic bot detection
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
  return page;
}

/**
 * Visits one place page and collects up to three review texts plus the final URL.
 * Never throws: on failure the error is logged and an empty review list is returned.
 * The tab is always closed, whether the scrape succeeded or not.
 *
 * @param {object} browser - Browser handle returned by `puppeteer.launch`.
 * @param {{name: string, url: string}} place - Result card to deep-scrape.
 * @returns {Promise<{reviews: string[], url: string}>} Reviews and the URL to store.
 */
async function scrapePlaceDetails(browser, place) {
  let detailPage = null;
  try {
    detailPage = await openScraperPage(browser);
    // Allow network requests to settle so the dynamic review div renders
    await detailPage.goto(place.url, { waitUntil: 'networkidle2', timeout: 35000 });
    // Extra pause before reading the DOM; the page fills in content after load
    await delay(3000);
    // Extract up to 3 text reviews
    const reviews = await detailPage.evaluate(() => {
      // `.wiI7pd` is the class this scraper relies on for review body text;
      // it is a Google-generated name and may change without notice.
      const reviewElements = Array.from(document.querySelectorAll('.wiI7pd'));
      return reviewElements
        .map(el => el.textContent.trim())
        .filter(text => text.length > 10) // Ignore "Good." or empty strings
        .slice(0, 3);
    });
    // Keep the post-navigation URL only when it carries an '@' segment, which is
    // where the lat/lon pair is parsed from later on.
    const finalUrl = detailPage.url();
    const url = finalUrl.includes('@') ? finalUrl : place.url;
    console.log(` -> Harvested ${reviews.length} reviews and updated URL.`);
    return { reviews, url };
  } catch (deepScrapeErr) {
    console.error(` -> Failed to deep scrape ${place.name}: ${deepScrapeErr.message}`);
    return { reviews: [], url: place.url };
  } finally {
    // Close the tab on every path so a failed navigation does not leak it.
    if (detailPage) {
      try {
        await detailPage.close();
      } catch (closeErr) {
        console.error(` -> Failed to close detail page for ${place.name}: ${closeErr.message}`);
      }
    }
  }
}

/**
 * Scrapes Google Maps search results for `query`, deep-scrapes each result for
 * reviews, and upserts the rows into the `places` table.
 *
 * @param {string} query - Search text placed in the Maps search URL.
 * @param {string} category - Value stored in the `category` column of every row.
 * @param {string} city - Stored in the `city` column and used for the fallback address.
 * @param {{lat: (number|string), lng: (number|string)}|null} [userLocation=null] -
 *   When both coordinates are usable, the search is anchored at them (zoom 15z);
 *   otherwise a plain text search is used.
 * @returns {Promise<true|undefined>} `true` after the insert phase ran; `undefined`
 *   when the results page yielded no places.
 * @throws Rethrows any error from the scrape after logging it. Failures for a single
 *   place's deep scrape or DB insert are logged and skipped instead.
 */
async function scrapeGoogleMaps(query, category, city, userLocation = null) {
  // Explicit coordinate validation: a plain truthiness check would reject 0.
  const hasCoordinates = Boolean(userLocation)
    && isUsableCoordinate(userLocation.lat)
    && isUsableCoordinate(userLocation.lng);
  if (hasCoordinates) {
    console.log(`Starting LOCAL web scrape for: "${query}" near coordinates ${userLocation.lat}, ${userLocation.lng}...`);
  } else {
    console.log(`Starting web scrape for: "${query}" in ${city || 'unknown region'}...`);
  }
  const db = await getDb();
  // Launch headless browser
  const browser = await puppeteer.launch({
    headless: "new",
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });
  try {
    // Page setup lives inside the try so the browser is closed if it fails.
    const page = await openScraperPage(browser);
    // Anchor the search around the GPS coordinates when available,
    // otherwise fall back to a standard text search.
    const url = hasCoordinates
      ? `https://www.google.com/maps/search/${encodeURIComponent(query)}/@${userLocation.lat},${userLocation.lng},15z`
      : `https://www.google.com/maps/search/${encodeURIComponent(query)}/`;
    console.log(`Navigating to: ${url}`);
    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
    console.log('Waiting for search results to render...');
    await delay(5000); // Give Maps time to load the dynamic result list
    // Extract data directly from the DOM. This callback runs in the browser
    // context, so it must not reference anything from the Node side.
    const results = await page.evaluate(() => {
      const places = [];
      // Target the result container cards
      const elements = document.querySelectorAll('div[role="article"]');
      elements.forEach(el => {
        const linkEl = el.querySelector('a[href*="/maps/place/"]');
        if (!linkEl) return;
        const url = linkEl.href;
        const name = el.getAttribute('aria-label') || '';
        if (!name) return;
        const textTokens = el.innerText.split('\n').map(t => t.trim()).filter(Boolean);
        let rating = 'N/A';
        let rawCategory = '';
        let address = '';
        // Extract Rating (e.g. "4.6(1,011)" or "4.6") from the text tokens
        for (const token of textTokens) {
          const ratingMatch = token.match(/^(\d+\.\d+)(?:\(|\s|$)/);
          if (ratingMatch) {
            rating = ratingMatch[1];
            break;
          }
        }
        let priceRange = 'N/A';
        // Extract Price Range (first run of currency symbols: ₹, $, £, €)
        for (const token of textTokens) {
          const priceMatch = token.match(/([₹$£€]+)/);
          if (priceMatch) {
            priceRange = priceMatch[1];
            break;
          }
        }
        // Extract Category and Address. Both are heuristics; the last matching
        // token wins because the loop does not stop at the first hit.
        for (let i = 0; i < textTokens.length; i++) {
          const token = textTokens[i];
          // Category: text before the first bullet separator
          if (token.includes('·')) {
            const parts = token.split('·');
            rawCategory = parts[0].trim();
          }
          // Address: contains a digit, is longer than 10 chars, is not the
          // rating itself and has no '(' (which rating-with-count tokens carry)
          if (token.match(/\d+/) && token !== rating && token.length > 10 && !token.includes('(')) {
            address = token;
          }
        }
        // De-duplicate by display name
        if (name && !places.some(p => p.name === name)) {
          places.push({
            name: name,
            rawCategory: rawCategory,
            address: address,
            rating: rating,
            priceRange: priceRange,
            url: url
          });
        }
      });
      // Capture more than just the absolute top results so that
      // popular chains and nearby options have a better chance
      // of being included in our custom database.
      return places.slice(0, 12); // Take top 12
    });
    console.log(`Successfully scraped ${results.length} raw locations.`);
    if (results.length === 0) {
      console.log("No results found on the first page. Note: Google Maps might be blocking headless requests or changing DOM selectors.");
      // The finally block below closes the browser; closing here too would do it twice.
      return;
    }
    // Phase 13: Deep Review Extraction
    console.log('Navigating to physical place pages to scrape user reviews... (This will take longer due to active loading requests)');
    // Pages are visited one at a time, matching the original sequential behaviour.
    for (let i = 0; i < results.length; i++) {
      const place = results[i];
      console.log(`[${i + 1}/${results.length}] Deep Scraping: ${place.name}`);
      const details = await scrapePlaceDetails(browser, place);
      place.reviews = details.reviews;
      place.url = details.url;
    }
    // Clean and prepare the scraped data for our DB
    console.log('Inserting scraped data into custom SQLite database...');
    let insertedCount = 0;
    for (const place of results) {
      // The id is derived from the name alone, so a re-scrape replaces the same row.
      const id = `gmaps-${crypto.createHash('md5').update(place.name).digest('hex').substring(0, 10)}`;
      // Fall back to "name, city"; skip the city part when no city was supplied
      // rather than storing the literal text "undefined".
      const realAddress = place.address || [place.name, city].filter(Boolean).join(', ');
      const rating = place.rating || 'N/A';
      const priceRange = place.priceRange || 'N/A';
      // Pass the raw category to the AI ranker via the features array
      const featuresArray = ['Scraped from Maps'];
      if (place.rawCategory) {
        featuresArray.push(`Actual Map Type: ${place.rawCategory}`);
      }
      const mockFeatures = JSON.stringify(featuresArray);
      const stringifiedReviews = JSON.stringify(place.reviews || []);
      // Extract lat/lng from the URL if possible
      let lat = null;
      let lon = null;
      const coordMatch = place.url.match(/@(-?\d+\.\d+),(-?\d+\.\d+)/);
      if (coordMatch) {
        lat = parseFloat(coordMatch[1]);
        lon = parseFloat(coordMatch[2]);
      }
      try {
        await db.run(`
INSERT OR REPLACE INTO places
(id, name, category, address, city, rating, priceRange, features, reviews, lat, lon)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`, [id, place.name, category, realAddress, city, rating, priceRange, mockFeatures, stringifiedReviews, lat, lon]);
        insertedCount++;
        // Compare against null so a latitude of exactly 0 is still printed.
        console.log(` ✅ Saved: ${place.name} ${lat !== null ? `(${lat}, ${lon})` : ''}`);
      } catch (err) {
        console.error(` ❌ Failed to save ${place.name}:`, err.message);
      }
    }
    console.log(`\n🎉 Scraper finished! Saved ${insertedCount} locations to custom database.`);
    return true;
  } catch (error) {
    console.error('Fatal Web Scraping Error:', error);
    throw error; // Let the calling API route handle the error state
  } finally {
    await browser.close();
  }
}
// Allow running from CLI directly: node src/scripts/scraper.js "cafe in bhubaneswar" cafe bhubaneswar
// Positional arguments: <search_query> <category> <city> [lat] [lng].
// Exit code is 0 on success and 1 on bad usage or when the scrape rejects.
if (require.main === module) {
  const args = process.argv.slice(2);
  if (args.length < 3) {
    console.log("Usage: node scraper.js <search_query> <category> <city> [lat] [lng]");
    console.log('Example: node scraper.js "best cafe" cafe patia 20.2961 85.8245');
    process.exit(1);
  }
  // Coordinates are only forwarded when both were supplied; they stay as strings.
  const loc = (args[3] && args[4]) ? { lat: args[3], lng: args[4] } : null;
  scrapeGoogleMaps(args[0], args[1], args[2], loc)
    .then(() => {
      // NOTE(review): nothing visible here closes the DB handle explicitly;
      // the process exit below is what ends the session.
      console.log("Database connection closed.");
      process.exit(0);
    })
    .catch((err) => {
      // scrapeGoogleMaps rethrows after logging, so without this handler the
      // rejection would be left unhandled. Exit non-zero so callers see the failure.
      console.error('Scraper run failed:', err && err.message ? err.message : err);
      process.exit(1);
    });
}
module.exports = scrapeGoogleMaps;