File size: 10,584 Bytes
0dd2082
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
const puppeteer = require('puppeteer');
const { getDb } = require('../db/database');
const crypto = require('crypto');

/**
 * Pause the calling async flow for a fixed amount of time.
 *
 * @param {number} ms - Milliseconds to wait; handed straight to setTimeout.
 * @returns {Promise<void>} Settles (with no value) once the timer fires.
 */
async function delay(ms) {
    await new Promise((wake) => {
        setTimeout(wake, ms);
    });
}

// Desktop Chrome user agent applied to every page the scraper opens, so the
// search page and the per-place detail pages present the same browser identity.
const SCRAPER_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

// Viewport applied to every page the scraper opens (matches --window-size below).
const SCRAPER_VIEWPORT = { width: 1920, height: 1080 };

/**
 * Tell whether a single latitude/longitude value can be embedded in a Maps URL.
 * Accepts finite numbers (including 0) and non-blank numeric strings, which is
 * what the CLI passes through.
 *
 * @param {*} value - Candidate coordinate.
 * @returns {boolean} True when the value is a finite number or numeric string.
 */
function isCoordinate(value) {
    if (typeof value === 'string') {
        return value.trim() !== '' && Number.isFinite(Number(value));
    }
    return typeof value === 'number' && Number.isFinite(value);
}

/**
 * Tell whether a user location carries both a usable `lat` and `lng`.
 *
 * @param {?{lat: (number|string), lng: (number|string)}} location
 * @returns {boolean}
 */
function hasUsableCoordinates(location) {
    if (!location) {
        return false;
    }
    return isCoordinate(location.lat) && isCoordinate(location.lng);
}

/**
 * Build the Google Maps search URL for a query, anchored at the user's
 * coordinates (zoom level 15z) when they are usable, plain text search otherwise.
 *
 * @param {string} query - Free-text search; URL-encoded into the path.
 * @param {?{lat: (number|string), lng: (number|string)}} userLocation
 * @returns {string} The URL to navigate to.
 */
function buildSearchUrl(query, userLocation) {
    const encodedQuery = encodeURIComponent(query);
    if (hasUsableCoordinates(userLocation)) {
        return `https://www.google.com/maps/search/${encodedQuery}/@${userLocation.lat},${userLocation.lng},15z`;
    }
    return `https://www.google.com/maps/search/${encodedQuery}/`;
}

/**
 * Apply the shared viewport and user agent to a freshly opened page.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
async function configureScraperPage(page) {
    await page.setViewport(SCRAPER_VIEWPORT);
    await page.setUserAgent(SCRAPER_USER_AGENT);
}

/**
 * Read the result cards out of the Maps search page.
 *
 * This function is handed to `page.evaluate`, so it executes inside the page:
 * it must stay self-contained and may not reference anything from this module.
 * All text parsing below is heuristic and based on the visible card text.
 *
 * @returns {Array<{name: string, rawCategory: string, address: string, rating: string, priceRange: string, url: string}>}
 *   At most 12 cards, de-duplicated by name, in page order.
 */
function collectPlaceCards() {
    const places = [];
    // Target the result container cards
    const cards = document.querySelectorAll('div[role="article"]');

    cards.forEach((card) => {
        const linkEl = card.querySelector('a[href*="/maps/place/"]');
        if (!linkEl) return;

        const url = linkEl.href;
        const name = card.getAttribute('aria-label') || '';
        if (!name) return;

        const textTokens = card.innerText.split('\n').map((t) => t.trim()).filter(Boolean);

        let rating = 'N/A';
        let priceRange = 'N/A';
        let rawCategory = '';
        let address = '';

        // Rating: first token that starts with a decimal such as "4.6(1,011)" or "4.6"
        for (const token of textTokens) {
            const ratingMatch = token.match(/^(\d+\.\d+)(?:\(|\s|$)/);
            if (ratingMatch) {
                rating = ratingMatch[1];
                break;
            }
        }

        // Price range: first run of currency symbols (₹, $, £, €)
        for (const token of textTokens) {
            const priceMatch = token.match(/([₹$£€]+)/);
            if (priceMatch) {
                priceRange = priceMatch[1];
                break;
            }
        }

        // Category and address: the LAST matching token wins for each, because
        // neither assignment stops the scan.
        for (const token of textTokens) {
            // Category: text before the first bullet separator
            if (token.includes('·')) {
                rawCategory = token.split('·')[0].trim();
            }

            // Address: longer token containing a digit that is not the rating
            // and has no parenthesis (which would indicate a review count)
            if (token.match(/\d+/) && token !== rating && token.length > 10 && !token.includes('(')) {
                address = token;
            }
        }

        if (!places.some((p) => p.name === name)) {
            places.push({
                name: name,
                rawCategory: rawCategory,
                address: address,
                rating: rating,
                priceRange: priceRange,
                url: url
            });
        }
    });

    // Capture more than just the absolute top results so that popular chains
    // and nearby options have a better chance of being included.
    return places.slice(0, 12);
}

/**
 * Open a place's own page, collect up to three review texts, and refresh the
 * place URL with the post-navigation URL when that one contains an `@` segment.
 *
 * Mutates `place` in place: always sets `place.reviews` (empty array on any
 * failure) and may overwrite `place.url`. Never throws for scraping failures;
 * the detail page is closed on every path so failed places do not leak tabs.
 *
 * @param {object} browser - Puppeteer browser used to open the detail page.
 * @param {{name: string, url: string}} place - Search result to enrich.
 * @returns {Promise<void>}
 */
async function harvestPlaceDetails(browser, place) {
    let detailPage;
    try {
        detailPage = await browser.newPage();
        await configureScraperPage(detailPage);

        // Allow network requests to settle so the dynamic review div renders
        await detailPage.goto(place.url, { waitUntil: 'networkidle2', timeout: 35000 });

        // Extra pause after network idle to give the page time to finish rendering
        await delay(3000);

        // Extract up to 3 text reviews.
        // NOTE(review): '.wiI7pd' is an obfuscated class name assumed to mark
        // review body text — it may change without notice; verify periodically.
        const extractedReviews = await detailPage.evaluate(() => {
            const reviewElements = Array.from(document.querySelectorAll('.wiI7pd'));
            return reviewElements
                .map((el) => el.textContent.trim())
                .filter((text) => text.length > 10) // Ignore "Good." or empty strings
                .slice(0, 3);
        });

        place.reviews = extractedReviews;

        // The URL after navigation/redirection is where the @lat,lon segment
        // (parsed later for the lat/lon columns) is expected to appear.
        const finalUrl = detailPage.url();
        if (finalUrl.includes('@')) {
            place.url = finalUrl;
        }

        console.log(`    -> Harvested ${extractedReviews.length} reviews and updated URL.`);
    } catch (deepScrapeErr) {
        console.error(`    -> Failed to deep scrape ${place.name}: ${deepScrapeErr.message}`);
        place.reviews = [];
    } finally {
        if (detailPage) {
            try {
                await detailPage.close();
            } catch (closeErr) {
                console.error(`    -> Failed to close detail page for ${place.name}: ${closeErr.message}`);
            }
        }
    }
}

/**
 * Pull the first `@<lat>,<lon>` decimal pair out of a Maps URL.
 *
 * @param {string} url - URL to inspect.
 * @returns {{lat: ?number, lon: ?number}} Parsed numbers, or nulls when absent.
 */
function parseCoordinatesFromUrl(url) {
    const coordMatch = url.match(/@(-?\d+\.\d+),(-?\d+\.\d+)/);
    if (!coordMatch) {
        return { lat: null, lon: null };
    }
    return { lat: parseFloat(coordMatch[1]), lon: parseFloat(coordMatch[2]) };
}

/**
 * Scrape Google Maps search results for a query, enrich each result with a few
 * reviews from its detail page, and upsert the results into the `places` table.
 *
 * @param {string} query - Free-text search sent to Google Maps.
 * @param {string} category - Stored unchanged in the `category` column of every row.
 * @param {string} [city] - Stored in the `city` column; also used in the fallback address.
 * @param {?{lat: (number|string), lng: (number|string)}} [userLocation=null]
 *   When both coordinates are usable the search is anchored around them.
 * @returns {Promise<(true|undefined)>} `true` after the insert phase has run;
 *   `undefined` when the search page yielded no results.
 * @throws Re-throws any error from launching the browser, navigating to or
 *   evaluating the search page (after logging it). Per-place scrape failures
 *   and per-row insert failures are logged and skipped instead.
 */
async function scrapeGoogleMaps(query, category, city, userLocation = null) {
    if (hasUsableCoordinates(userLocation)) {
        console.log(`Starting LOCAL web scrape for: "${query}" near coordinates ${userLocation.lat}, ${userLocation.lng}...`);
    } else {
        console.log(`Starting web scrape for: "${query}" in ${city || 'unknown region'}...`);
    }

    const db = await getDb();

    // Declared outside the try so `finally` can close it, but launched inside
    // so that a failure while preparing the first page cannot leak the browser.
    let browser;

    try {
        // Launch headless browser
        browser = await puppeteer.launch({
            headless: "new",
            args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
        });

        const page = await browser.newPage();
        await configureScraperPage(page);

        const url = buildSearchUrl(query, userLocation);
        console.log(`Navigating to: ${url}`);
        await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });

        console.log('Waiting for search results to render...');
        await delay(5000); // Give Maps time to load the dynamic result list

        // Extract data directly from the DOM using page.evaluate
        const results = await page.evaluate(collectPlaceCards);

        console.log(`Successfully scraped ${results.length} raw locations.`);

        if (results.length === 0) {
            console.log("No results found on the first page. Note: Google Maps might be blocking headless requests or changing DOM selectors.");
            // The browser is closed exactly once, by the finally block below.
            return undefined;
        }

        // Phase 13: Deep Review Extraction.
        // Places are visited one at a time, each in its own short-lived tab.
        console.log('Navigating to physical place pages to scrape user reviews... (This will take longer due to active loading requests)');
        for (let i = 0; i < results.length; i++) {
            const place = results[i];
            console.log(`[${i + 1}/${results.length}] Deep Scraping: ${place.name}`);
            await harvestPlaceDetails(browser, place);
        }

        // Clean and prepare the scraped data for our DB
        console.log('Inserting scraped data into custom SQLite database...');
        let insertedCount = 0;

        for (const place of results) {
            // NOTE(review): the id is derived from the name alone, so two
            // different places sharing a name map to the same row and the
            // later INSERT OR REPLACE overwrites the earlier one.
            const id = `gmaps-${crypto.createHash('md5').update(place.name).digest('hex').substring(0, 10)}`;

            // Fall back to "<name>, <city>", omitting the city part when no
            // city was supplied (avoids storing "Name, undefined").
            const realAddress = place.address || [place.name, city].filter(Boolean).join(', ');

            const rating = place.rating || 'N/A';
            const priceRange = place.priceRange || 'N/A';

            // The raw category is carried inside the features array
            const featuresArray = ['Scraped from Maps'];
            if (place.rawCategory) {
                featuresArray.push(`Actual Map Type: ${place.rawCategory}`);
            }
            const mockFeatures = JSON.stringify(featuresArray);

            const stringifiedReviews = JSON.stringify(place.reviews || []);

            // Extract lat/lng from the URL if possible
            const { lat, lon } = parseCoordinatesFromUrl(place.url);

            try {
                await db.run(`
                    INSERT OR REPLACE INTO places 
                    (id, name, category, address, city, rating, priceRange, features, reviews, lat, lon) 
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                `, [id, place.name, category, realAddress, city, rating, priceRange, mockFeatures, stringifiedReviews, lat, lon]);

                insertedCount++;
                // Compare against null so a latitude of exactly 0 is still printed
                console.log(` ✅ Saved: ${place.name} ${lat !== null ? `(${lat}, ${lon})` : ''}`);
            } catch (err) {
                console.error(` ❌ Failed to save ${place.name}:`, err.message);
            }
        }

        console.log(`\n🎉 Scraper finished! Saved ${insertedCount} locations to custom database.`);
        return true;

    } catch (error) {
        console.error('Fatal Web Scraping Error:', error);
        throw error; // Let the calling API route handle the error state
    } finally {
        if (browser) {
            await browser.close();
        }
    }
}
// Allow running from CLI directly: node src/scripts/scraper.js "cafe in bhubaneswar" cafe bhubaneswar
// Runs only when this file is the process entry point, not when it is required.
if (require.main === module) {
    const args = process.argv.slice(2);
    if (args.length < 3) {
        console.log("Usage: node scraper.js <search_query> <category> <city> [lat] [lng]");
        console.log('Example: node scraper.js "best cafe" cafe patia 20.2961 85.8245');
        process.exit(1);
    }

    const [searchQuery, category, city, lat, lng] = args;

    // Coordinates are optional and are forwarded as the raw CLI strings;
    // both must be present for a location-anchored search.
    const loc = (lat && lng) ? { lat: lat, lng: lng } : null;

    scrapeGoogleMaps(searchQuery, category, city, loc)
        .then(() => {
            // NOTE(review): nothing visible in this file closes the database
            // handle; the message is kept as-is for output compatibility.
            console.log("Database connection closed.");
            process.exit(0);
        })
        .catch((err) => {
            // Without this handler a failed scrape would surface as an
            // unhandled rejection instead of a clear non-zero exit.
            console.error('Scraper run failed:', err && err.message ? err.message : err);
            process.exit(1);
        });
}

// CommonJS export: the scraper entry point is the module's only export, so
// consumers receive the function itself from require().
module.exports = scrapeGoogleMaps;