Spaces:
Running
Running
| const puppeteer = require('puppeteer'); | |
| const { getDb } = require('../db/database'); | |
| const crypto = require('crypto'); | |
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - How long to wait, in milliseconds.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
function delay(ms) {
    // Returning the promise directly is enough; nothing is awaited here,
    // so wrapping the function in `async` would add nothing.
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
// Desktop Chrome user agent applied to the search page and to every detail page,
// so both kinds of request present the same browser identity.
const SCRAPER_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

// Viewport used for every page; matches the --window-size launch argument.
const SCRAPER_VIEWPORT = Object.freeze({ width: 1920, height: 1080 });

/**
 * Decide whether a user location carries coordinates that can anchor a search.
 * A coordinate of 0 is valid; null, undefined, blank and non-numeric values are not.
 *
 * @param {?{lat: (number|string), lng: (number|string)}} userLocation
 * @returns {boolean} True when both lat and lng are present and numeric.
 */
function hasUsableCoordinates(userLocation) {
    if (!userLocation) {
        return false;
    }
    return [userLocation.lat, userLocation.lng].every((value) => {
        if (value === null || value === undefined) {
            return false;
        }
        // Number('') and Number('  ') are 0, so blank strings must be rejected first.
        if (String(value).trim() === '') {
            return false;
        }
        return Number.isFinite(Number(value));
    });
}

/**
 * Build the Google Maps search URL, anchored at the user's coordinates (zoom 15z)
 * when they are usable, otherwise a plain text search.
 *
 * @param {string} query - Free-text search query.
 * @param {?{lat: (number|string), lng: (number|string)}} userLocation
 * @returns {string} URL to navigate to.
 */
function buildSearchUrl(query, userLocation) {
    const textSearchUrl = `https://www.google.com/maps/search/${encodeURIComponent(query)}/`;
    if (!hasUsableCoordinates(userLocation)) {
        return textSearchUrl;
    }
    return `${textSearchUrl}@${userLocation.lat},${userLocation.lng},15z`;
}

/**
 * Apply the shared viewport and user agent to a freshly opened page.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
async function applyDesktopProfile(page) {
    await page.setViewport(SCRAPER_VIEWPORT);
    await page.setUserAgent(SCRAPER_USER_AGENT);
}

/**
 * Read the result cards of a Maps search page.
 *
 * Runs INSIDE the browser via page.evaluate, so it must stay self-contained:
 * it may only use `document` and its own locals, never anything from Node scope.
 *
 * @returns {Array<{name: string, rawCategory: string, address: string, rating: string, priceRange: string, url: string}>}
 *     Up to 12 places, de-duplicated by name, in page order.
 */
function extractPlaceCards() {
    const places = [];
    const seenNames = new Set();

    // Target the result container cards
    const cards = document.querySelectorAll('div[role="article"]');
    cards.forEach((card) => {
        const linkEl = card.querySelector('a[href*="/maps/place/"]');
        if (!linkEl) {
            return;
        }

        const name = card.getAttribute('aria-label') || '';
        // NOTE(review): de-duplicating by name keeps only the first branch of a chain.
        if (!name || seenNames.has(name)) {
            return;
        }

        const textTokens = card.innerText
            .split('\n')
            .map((line) => line.trim())
            .filter(Boolean);

        // Rating (e.g. "4.6(1,011)" or "4.6"): first token that starts with a decimal number.
        let rating = 'N/A';
        for (const token of textTokens) {
            const ratingMatch = token.match(/^(\d+\.\d+)(?:\(|\s|$)/);
            if (ratingMatch) {
                rating = ratingMatch[1];
                break;
            }
        }

        // Price range: first run of currency symbols (₹, $, £, €) found in any token.
        let priceRange = 'N/A';
        for (const token of textTokens) {
            const priceMatch = token.match(/([₹$£€]+)/);
            if (priceMatch) {
                priceRange = priceMatch[1];
                break;
            }
        }

        // Category and address are heuristics over the card text; for both,
        // the LAST matching token wins.
        let rawCategory = '';
        let address = '';
        for (const token of textTokens) {
            // Category: the text before the first bullet separator.
            if (token.includes('·')) {
                rawCategory = token.split('·')[0].trim();
            }
            // Address: a longer token containing digits that is neither the rating
            // nor a "(review count)" style token.
            if (/\d/.test(token) && token !== rating && token.length > 10 && !token.includes('(')) {
                address = token;
            }
        }

        seenNames.add(name);
        places.push({
            name,
            rawCategory,
            address,
            rating,
            priceRange,
            url: linkEl.href,
        });
    });

    // Keep more than just the very top hits so popular and nearby options
    // have a better chance of reaching the database.
    return places.slice(0, 12);
}

/**
 * Read up to three review bodies from a place detail page.
 *
 * Runs INSIDE the browser via page.evaluate; must stay self-contained.
 * NOTE(review): `.wiI7pd` is an obfuscated Maps class name that this scraper relies on
 * for review body text — confirm it still matches when results come back empty.
 *
 * @returns {string[]} Trimmed review texts longer than 10 characters.
 */
function extractReviewTexts() {
    return Array.from(document.querySelectorAll('.wiI7pd'))
        .map((el) => el.textContent.trim())
        .filter((text) => text.length > 10) // Ignore "Good." or empty strings
        .slice(0, 3);
}

/**
 * Visit one place's detail page and enrich `place` in place: sets `place.reviews`
 * and, when the final URL carries an `@lat,lng` segment, replaces `place.url` with it.
 * Failures are logged and leave the place with an empty review list (best effort).
 *
 * @param {object} browser - Puppeteer browser.
 * @param {object} place - Place record produced by extractPlaceCards; mutated.
 * @returns {Promise<void>}
 */
async function deepScrapePlace(browser, place) {
    let detailPage;
    try {
        detailPage = await browser.newPage();
        await applyDesktopProfile(detailPage);

        // Allow network requests to settle so the dynamic review section renders
        await detailPage.goto(place.url, { waitUntil: 'networkidle2', timeout: 35000 });
        // Brief extra pause so late-rendering child components are present
        await delay(3000);

        const extractedReviews = await detailPage.evaluate(extractReviewTexts);
        place.reviews = extractedReviews;

        // The post-redirect URL usually contains the @lat,lon coordinates
        // that are parsed out again when the place is saved.
        const finalUrl = detailPage.url();
        if (finalUrl.includes('@')) {
            place.url = finalUrl;
        }
        console.log(` -> Harvested ${extractedReviews.length} reviews and updated URL.`);
    } catch (deepScrapeErr) {
        console.error(` -> Failed to deep scrape ${place.name}: ${deepScrapeErr.message}`);
        place.reviews = [];
    } finally {
        // Always release the tab: a failed navigation must not leave it open
        // for the remainder of the run.
        if (detailPage) {
            try {
                await detailPage.close();
            } catch (closeErr) {
                console.error(` -> Failed to close detail page for ${place.name}: ${closeErr.message}`);
            }
        }
    }
}

/**
 * Pull the `@lat,lng` pair out of a Maps URL.
 *
 * @param {string} url
 * @returns {{lat: ?number, lon: ?number}} Parsed coordinates, or nulls when absent.
 */
function parseCoordinatesFromUrl(url) {
    const coordMatch = /@(-?\d+\.\d+),(-?\d+\.\d+)/.exec(url);
    if (!coordMatch) {
        return { lat: null, lon: null };
    }
    return {
        lat: Number.parseFloat(coordMatch[1]),
        lon: Number.parseFloat(coordMatch[2]),
    };
}

/**
 * Insert (or replace) one scraped place in the `places` table.
 * Insert failures are logged and reported through the return value, not thrown.
 *
 * @param {object} db - Database handle exposing `run(sql, params)`.
 * @param {object} place - Enriched place record.
 * @param {string} category - Category stored with the record.
 * @param {string} city - City stored with the record.
 * @returns {Promise<boolean>} True when the row was written.
 */
async function savePlace(db, place, category, city) {
    // NOTE(review): the id is derived from the name alone, so two different places
    // sharing a name map to the same row and the later one replaces the earlier.
    const nameDigest = crypto.createHash('md5').update(place.name).digest('hex');
    const id = `gmaps-${nameDigest.substring(0, 10)}`;

    // Without a scraped address fall back to "name, city" (or just the name when no city is known).
    const fallbackAddress = city ? `${place.name}, ${city}` : place.name;
    const realAddress = place.address || fallbackAddress;

    const rating = place.rating || 'N/A';
    const priceRange = place.priceRange || 'N/A';

    // The raw map category rides along inside the features array.
    const featuresArray = ['Scraped from Maps'];
    if (place.rawCategory) {
        featuresArray.push(`Actual Map Type: ${place.rawCategory}`);
    }
    const mockFeatures = JSON.stringify(featuresArray);
    const stringifiedReviews = JSON.stringify(place.reviews || []);

    const { lat, lon } = parseCoordinatesFromUrl(place.url);

    try {
        await db.run(`
            INSERT OR REPLACE INTO places
            (id, name, category, address, city, rating, priceRange, features, reviews, lat, lon)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        `, [id, place.name, category, realAddress, city, rating, priceRange, mockFeatures, stringifiedReviews, lat, lon]);
        // Compare against null so a latitude of exactly 0 is still printed.
        console.log(` ✅ Saved: ${place.name} ${lat !== null ? `(${lat}, ${lon})` : ''}`);
        return true;
    } catch (err) {
        console.error(` ❌ Failed to save ${place.name}:`, err.message);
        return false;
    }
}

/**
 * Scrape Google Maps search results for `query`, visit each result's detail page
 * for reviews and coordinates, and store the places in the database.
 *
 * @param {string} query - Free-text search, e.g. "best cafe".
 * @param {string} category - Category stored with every saved place.
 * @param {string} city - City stored with every saved place and used in fallback addresses.
 * @param {?{lat: (number|string), lng: (number|string)}} [userLocation=null]
 *     Optional coordinates to anchor the search around.
 * @returns {Promise<(boolean|undefined)>} `true` after the save phase ran;
 *     `undefined` when the search page yielded no results.
 * @throws Rethrows any error from launching the browser or scraping the search page,
 *     after logging it. Per-place scrape/save failures are logged and skipped.
 */
async function scrapeGoogleMaps(query, category, city, userLocation = null) {
    // Same predicate as the URL builder, so the log always matches the search performed.
    if (hasUsableCoordinates(userLocation)) {
        console.log(`Starting LOCAL web scrape for: "${query}" near coordinates ${userLocation.lat}, ${userLocation.lng}...`);
    } else {
        console.log(`Starting web scrape for: "${query}" in ${city || 'unknown region'}...`);
    }

    const db = await getDb();

    let browser;
    try {
        // Launch inside the try so a failure while opening the first page
        // still reaches the finally block and shuts the browser down.
        browser = await puppeteer.launch({
            headless: "new",
            args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
        });
        const page = await browser.newPage();
        await applyDesktopProfile(page);

        const url = buildSearchUrl(query, userLocation);
        console.log(`Navigating to: ${url}`);
        await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });

        console.log('Waiting for search results to render...');
        await delay(5000); // Give Maps time to load the dynamic result list

        const results = await page.evaluate(extractPlaceCards);
        console.log(`Successfully scraped ${results.length} raw locations.`);

        if (results.length === 0) {
            console.log("No results found on the first page. Note: Google Maps might be blocking headless requests or changing DOM selectors.");
            // The finally block closes the browser; no explicit close needed here.
            return;
        }

        // Phase 13: Deep Review Extraction
        console.log('Navigating to physical place pages to scrape user reviews... (This will take longer due to active loading requests)');
        for (const [index, place] of results.entries()) {
            console.log(`[${index + 1}/${results.length}] Deep Scraping: ${place.name}`);
            await deepScrapePlace(browser, place);
        }

        console.log('Inserting scraped data into custom SQLite database...');
        let insertedCount = 0;
        for (const place of results) {
            if (await savePlace(db, place, category, city)) {
                insertedCount++;
            }
        }

        console.log(`\n🎉 Scraper finished! Saved ${insertedCount} locations to custom database.`);
        return true;
    } catch (error) {
        console.error('Fatal Web Scraping Error:', error);
        throw error; // Let the calling API route handle the error state
    } finally {
        if (browser) await browser.close();
    }
}
// Allow running from CLI directly: node src/scripts/scraper.js "cafe in bhubaneswar" cafe bhubaneswar
// Optional 4th and 5th arguments are the latitude and longitude to search around.
if (require.main === module) {
    const cliArgs = process.argv.slice(2);
    if (cliArgs.length < 3) {
        console.log("Usage: node scraper.js <search_query> <category> <city> [lat] [lng]");
        console.log('Example: node scraper.js "best cafe" cafe patia 20.2961 85.8245');
        process.exit(1);
    }

    const [searchQuery, category, city, lat, lng] = cliArgs;
    // Coordinates are only used when both are supplied; they are passed through as strings.
    const loc = (lat && lng) ? { lat, lng } : null;

    scrapeGoogleMaps(searchQuery, category, city, loc)
        .then(() => {
            // NOTE(review): nothing in this file closes the database handle;
            // the message is kept as-is for anyone matching on the output.
            console.log("Database connection closed.");
            process.exit(0);
        })
        .catch((err) => {
            // Without this handler a failed scrape ends as an unhandled rejection;
            // report it and exit non-zero so shells and schedulers see the failure.
            console.error('Scraper run failed:', err);
            process.exit(1);
        });
}

module.exports = scrapeGoogleMaps;