TushP committed on
Commit
2fd9100
Β·
verified Β·
1 Parent(s): 70bcde7

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. src/scrapers/google_maps_scraper.py +547 -336
src/scrapers/google_maps_scraper.py CHANGED
@@ -1,432 +1,603 @@
1
- """
2
- Google Maps Review Scraper - VERIFIED SELECTORS VERSION
3
- Based on confirmed DOM structure research (Nov 2024).
4
-
5
- VERIFIED SELECTORS:
6
- - Reviews tab: button.hh2c6.G7m0Af with aria-label="Reviews"
7
- - Scroll container: div.m6QErb.DxyBCb with role="feed"
8
- - Review card: div.jftiEf.fontBodyMedium with data-review-id
9
- - Review text: span.wiI7pd
10
- - More button: button.w8nwRe.kyuRq
11
 
12
- NO RETRY DELAYS - keep it simple like the OpenTable scraper that works.
 
 
 
 
 
13
  """
14
 
15
  import time
 
 
16
  from typing import List, Dict, Any, Optional, Callable
17
  from selenium import webdriver
18
- from selenium.webdriver.chrome.options import Options
19
- from selenium.webdriver.chrome.service import Service
20
- from selenium.webdriver.common.by import By
21
- from selenium.webdriver.common.keys import Keys
22
- from selenium.webdriver.common.action_chains import ActionChains
23
- from selenium.webdriver.support.ui import WebDriverWait
24
- from selenium.webdriver.support import expected_conditions as EC
25
  from selenium.common.exceptions import (
26
- TimeoutException,
27
  NoSuchElementException,
28
  StaleElementReferenceException,
29
- WebDriverException
 
30
  )
 
 
 
 
 
 
 
31
 
32
 
33
  class GoogleMapsScraper:
34
  """
35
- Google Maps review scraper with VERIFIED selectors.
 
 
 
 
 
 
 
 
 
36
  """
37
 
38
- # VERIFIED selectors from DOM research
39
  SELECTORS = {
 
 
 
 
 
40
  # Reviews tab button
41
  "reviews_tab": [
42
- "//button[@aria-label='Reviews']",
43
- "//button[contains(@class, 'hh2c6')]",
44
- "//button[@data-tab-index='1']",
45
- "//div[@role='tablist']//button[contains(., 'Review')]",
46
  ],
47
 
48
- # Scrollable container - from working version
49
- "scroll_container": [
50
  "//div[contains(@class, 'm6QErb') and contains(@class, 'DxyBCb')]",
51
  "//div[contains(@class, 'XiKgde')]",
52
  "//div[@role='feed']",
53
  "//div[contains(@class, 'm6QErb')][@tabindex='-1']",
54
  ],
55
 
56
- # Individual review cards - VERIFIED: div.jftiEf with data-review-id
57
  "review_cards": [
58
  "//div[@data-review-id]",
59
- "//div[contains(@class, 'jftiEf')]",
60
  "//div[contains(@class, 'jftiEf') and contains(@class, 'fontBodyMedium')]",
 
61
  ],
62
 
63
- # Review text - from working version with expanded text support
64
- "review_text": [
65
- ".//span[contains(@class, 'wiI7pd')]",
66
- ".//span[@jsname='fbQN7e']", # Full expanded text
67
- ".//span[@jsname='bN97Pc']", # Truncated text
68
- ".//div[contains(@class, 'MyEned')]//span",
69
- ],
70
-
71
- # "More" button to expand text - VERIFIED: button.w8nwRe
72
- "more_button": [
73
- ".//button[contains(@class, 'w8nwRe')]",
74
- ".//button[@aria-expanded='false']",
75
- ".//button[contains(text(), 'More')]",
76
  ],
77
 
78
- # Rating - span.kvMYJc or aria-label with stars
79
  "rating": [
80
- ".//span[contains(@class, 'kvMYJc')]",
81
- ".//span[contains(@aria-label, 'star')]",
 
82
  ],
83
 
84
- # Date - span.rsqaWe
85
  "date": [
86
  ".//span[contains(@class, 'rsqaWe')]",
87
  ".//span[contains(text(), 'ago')]",
 
 
 
 
88
  ],
89
 
90
- # Reviewer name - div.d4r55
91
- "reviewer_name": [
92
- ".//div[contains(@class, 'd4r55')]",
93
- ".//button[contains(@class, 'WEBjve')]//div",
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  ],
95
  }
96
 
97
- def __init__(self, headless: bool = True):
 
 
 
 
 
 
 
98
  self.headless = headless
99
  self.driver = None
100
  self.wait = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  def scrape_reviews(
103
  self,
104
  url: str,
105
- max_reviews: Optional[int] = 100,
106
  progress_callback: Optional[Callable[[str], None]] = None
107
  ) -> Dict[str, Any]:
108
  """
109
- Scrape reviews from Google Maps.
 
 
 
 
 
 
 
 
110
  """
111
  if not self._validate_url(url):
112
- return {'success': False, 'error': 'Invalid Google Maps URL', 'reviews': [], 'total_reviews': 0}
 
 
 
 
113
 
114
  try:
115
  self._init_driver()
116
  except Exception as e:
117
- return {'success': False, 'error': f'Browser init failed: {str(e)}', 'reviews': [], 'total_reviews': 0}
118
-
119
- reviews = []
120
- dates = []
121
- ratings = []
122
- names = []
123
 
124
  try:
125
- self._log("πŸš€ Starting Google Maps scraper...", progress_callback)
126
  self.driver.get(url)
127
- time.sleep(5) # Wait for initial load
128
 
129
- # Handle consent dialog if it appears
130
- self._handle_consent_dialog()
 
 
131
 
132
- # Click on Reviews tab
133
- self._log("πŸ“‹ Looking for Reviews tab...", progress_callback)
134
- if not self._click_reviews_tab(progress_callback):
135
- self._log("⚠️ Could not find Reviews tab, trying to scroll anyway...", progress_callback)
136
-
137
- time.sleep(3) # Wait for reviews to load
138
 
139
  # Find scrollable container
140
- scroll_container = self._find_scroll_container(progress_callback)
 
 
 
 
 
 
 
 
141
 
142
- # Scroll and collect reviews
143
- collected_ids = set()
144
- no_new_count = 0
145
  scroll_count = 0
146
- max_scrolls = min(100, max(20, (max_reviews or 100) // 2))
 
147
 
148
- while len(reviews) < (max_reviews or 100) and scroll_count < max_scrolls:
 
 
149
  scroll_count += 1
150
 
151
- # Find review cards
152
- review_cards = self._find_review_cards()
 
 
 
 
 
 
 
 
153
 
154
- new_count = 0
155
- for card in review_cards:
 
 
 
 
 
156
  try:
157
- # Get unique ID
158
- card_id = card.get_attribute('data-review-id')
159
- if not card_id:
160
- card_id = str(id(card))
161
-
162
- if card_id in collected_ids:
163
- continue
164
-
165
- # Click "More" to expand if needed
166
- self._expand_review(card)
167
-
168
- # Extract text - try multiple approaches like working version
169
- text = ""
170
- # First try expanded text selectors
171
- for selector in [".//span[@jsname='fbQN7e']", ".//span[contains(@class, 'wiI7pd')]"]:
172
- try:
173
- elem = card.find_element(By.XPATH, selector)
174
- t = elem.text.strip()
175
- if t and len(t) > len(text):
176
- text = t
177
- except:
178
- continue
179
-
180
- # Fallback to general text selectors
181
- if not text:
182
- text = self._extract_text(card, self.SELECTORS["review_text"])
183
-
184
- if text and len(text.strip()) > 10:
185
- collected_ids.add(card_id)
186
- reviews.append(text)
187
- dates.append(self._extract_text(card, self.SELECTORS["date"]))
188
- ratings.append(self._extract_rating(card))
189
- names.append(self._extract_text(card, self.SELECTORS["reviewer_name"]))
190
- new_count += 1
191
-
192
- if len(reviews) >= (max_reviews or 100):
193
- break
194
 
195
- except StaleElementReferenceException:
 
196
  continue
197
- except Exception:
198
- continue
199
-
200
- self._log(f"πŸ“„ Scroll {scroll_count}: Found {len(review_cards)} review cards, collected {len(reviews)} unique reviews", progress_callback)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
- if new_count == 0:
203
- no_new_count += 1
204
- if no_new_count >= 5:
205
- self._log("πŸ“ No new reviews after 5 scrolls, stopping", progress_callback)
206
- break
207
  else:
208
- no_new_count = 0
209
 
210
- # Scroll down
211
- self._scroll_down(scroll_container)
212
- time.sleep(1.5)
 
 
 
 
 
 
 
 
 
213
 
214
- self._log(f"βœ… Scraped {len(reviews)} reviews from Google Maps", progress_callback)
215
 
216
- if len(reviews) == 0:
217
- return {
218
- 'success': False,
219
- 'error': 'No reviews found. Selectors may need updating.',
220
- 'reviews': {},
221
- 'total_reviews': 0
222
- }
 
 
 
 
223
 
224
- # Return NESTED format matching working version
225
  return {
226
  'success': True,
227
- 'total_reviews': len(reviews),
228
  'total_pages': scroll_count,
229
- 'reviews': { # NESTED dict like working version
230
  'names': names,
231
  'dates': dates,
232
  'overall_ratings': ratings,
 
233
  'food_ratings': [0.0] * len(ratings),
234
  'service_ratings': [0.0] * len(ratings),
235
  'ambience_ratings': [0.0] * len(ratings),
236
- 'review_texts': reviews # 'review_texts' not 'reviews'
237
  },
238
  'metadata': {
239
  'source': 'google_maps',
 
240
  'scroll_count': scroll_count
241
  }
242
  }
243
 
244
- except TimeoutException as e:
245
- return {'success': False, 'error': f'Page load timeout: {str(e)}', 'reviews': [], 'total_reviews': 0}
246
- except WebDriverException as e:
247
- return {'success': False, 'error': f'Browser error: {str(e)}', 'reviews': [], 'total_reviews': 0}
248
  except Exception as e:
249
- return {'success': False, 'error': f'Scraping error: {str(e)}', 'reviews': [], 'total_reviews': 0}
250
- finally:
251
  self._cleanup()
252
-
253
- def _handle_consent_dialog(self):
254
- """Handle Google consent/cookie dialog."""
255
- try:
256
- # Try various consent buttons
257
- for selector in ["//button[contains(., 'Accept')]", "//button[contains(., 'Reject all')]", "//form//button"]:
258
- try:
259
- btn = self.driver.find_element(By.XPATH, selector)
260
- if btn.is_displayed():
261
- btn.click()
262
- time.sleep(1)
263
- return
264
- except:
265
- continue
266
- except:
267
- pass
268
-
269
- def _click_reviews_tab(self, progress_callback: Optional[Callable]) -> bool:
270
- """Click the Reviews tab."""
271
- for selector in self.SELECTORS["reviews_tab"]:
272
- try:
273
- tab = WebDriverWait(self.driver, 10).until(
274
- EC.element_to_be_clickable((By.XPATH, selector))
275
- )
276
- tab.click()
277
- self._log("βœ… Clicked Reviews tab", progress_callback)
278
- return True
279
- except:
280
- continue
281
-
282
- # Fallback: look for any button containing "Review" text
283
- try:
284
- buttons = self.driver.find_elements(By.TAG_NAME, "button")
285
- for btn in buttons:
286
- if 'review' in btn.text.lower():
287
- btn.click()
288
- self._log("βœ… Clicked Reviews tab (text match)", progress_callback)
289
- return True
290
- except:
291
- pass
292
-
293
- return False
294
-
295
- def _find_scroll_container(self, progress_callback: Optional[Callable]):
296
- """Find the scrollable reviews container."""
297
- for selector in self.SELECTORS["scroll_container"]:
298
- try:
299
- container = WebDriverWait(self.driver, 5).until(
300
- EC.presence_of_element_located((By.XPATH, selector))
301
- )
302
- self._log("βœ… Found scrollable container", progress_callback)
303
- return container
304
- except:
305
- continue
306
-
307
- self._log("⚠️ Could not find scrollable reviews container", progress_callback)
308
- return None
309
-
310
- def _find_review_cards(self) -> List:
311
- """Find all review cards."""
312
- for selector in self.SELECTORS["review_cards"]:
313
- try:
314
- cards = self.driver.find_elements(By.XPATH, selector)
315
- if cards:
316
- return cards
317
- except:
318
- continue
319
- return []
320
-
321
- def _expand_review(self, card):
322
- """Click "More" button to expand review text."""
323
- for selector in self.SELECTORS["more_button"]:
324
- try:
325
- btn = card.find_element(By.XPATH, selector)
326
- if btn.is_displayed():
327
- btn.click()
328
- time.sleep(0.3)
329
- return
330
- except:
331
- continue
332
-
333
- def _extract_text(self, card, selectors: List[str]) -> str:
334
- """Extract text using fallback selectors."""
335
- for selector in selectors:
336
- try:
337
- elem = card.find_element(By.XPATH, selector)
338
- text = elem.text.strip()
339
- if text:
340
- return text
341
- except:
342
- continue
343
- return ""
344
-
345
- def _extract_rating(self, card) -> float:
346
- """Extract star rating."""
347
- for selector in self.SELECTORS["rating"]:
348
- try:
349
- elem = card.find_element(By.XPATH, selector)
350
- # Try aria-label first
351
- aria = elem.get_attribute('aria-label') or ""
352
- for word in aria.split():
353
- try:
354
- return float(word)
355
- except:
356
- continue
357
- # Try text content
358
- text = elem.text
359
- for word in text.split():
360
- try:
361
- return float(word)
362
- except:
363
- continue
364
- except:
365
- continue
366
- return 0.0
367
-
368
- def _scroll_down(self, container):
369
- """Scroll down in the container - matches working version."""
370
- try:
371
- if container:
372
- # Scroll to bottom (like working version)
373
- self.driver.execute_script(
374
- "arguments[0].scrollTop = arguments[0].scrollHeight",
375
- container
376
- )
377
- else:
378
- # Fallback: scroll the page
379
- ActionChains(self.driver).send_keys(Keys.PAGE_DOWN).perform()
380
- except:
381
- ActionChains(self.driver).send_keys(Keys.PAGE_DOWN).perform()
382
-
383
- def _init_driver(self):
384
- """Initialize Chrome - SIMPLE settings like OpenTable."""
385
- chrome_options = Options()
386
- chrome_options.page_load_strategy = 'eager' # Fast like OpenTable
387
-
388
- if self.headless:
389
- chrome_options.add_argument('--headless=new')
390
- chrome_options.add_argument('--no-sandbox')
391
- chrome_options.add_argument('--disable-dev-shm-usage')
392
- chrome_options.add_argument('--disable-gpu')
393
-
394
- # User agent
395
- chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
396
-
397
- # Anti-detection
398
- chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
399
- chrome_options.add_experimental_option('useAutomationExtension', False)
400
-
401
- service = Service('/usr/local/bin/chromedriver')
402
- self.driver = webdriver.Chrome(service=service, options=chrome_options)
403
- self.driver.set_page_load_timeout(30)
404
- self.wait = WebDriverWait(self.driver, 10)
405
-
406
- # Anti-detection CDP command (from working version)
407
- self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
408
- 'source': '''
409
- Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
410
- Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
411
- '''
412
- })
413
-
414
- def _cleanup(self):
415
- """Close browser."""
416
- if self.driver:
417
- try:
418
- self.driver.quit()
419
- except:
420
- pass
421
- self.driver = None
422
 
423
  def _validate_url(self, url: str) -> bool:
424
  """Validate Google Maps URL."""
 
 
425
  url_lower = url.lower()
426
- return any(x in url_lower for x in ['google.com/maps', 'goo.gl/maps', 'maps.google', 'maps.app.goo.gl'])
 
 
 
 
 
427
 
428
- def _log(self, message: str, callback: Optional[Callable]):
429
- """Log progress."""
430
  print(message)
431
  if callback:
432
  callback(message)
@@ -435,23 +606,63 @@ class GoogleMapsScraper:
435
  self._cleanup()
436
 
437
 
438
- def scrape_google_maps(url: str, max_reviews: Optional[int] = 100, headless: bool = True) -> Dict[str, Any]:
 
 
 
 
 
439
  """
440
  Scrape reviews from Google Maps.
 
 
 
 
 
 
 
 
 
 
441
  """
442
- scraper = GoogleMapsScraper(headless=headless)
443
- return scraper.scrape_reviews(url, max_reviews)
444
 
445
 
446
  if __name__ == "__main__":
447
- test_url = "https://www.google.com/maps/place/Nightingale/@49.2784422,-123.1214336,17z"
448
- result = scrape_google_maps(test_url, max_reviews=20)
 
 
 
 
449
 
450
- print(f"\n{'='*60}")
451
- print(f"Success: {result.get('success')}")
452
- print(f"Reviews: {result.get('total_reviews', 0)}")
453
- if result.get('error'):
454
- print(f"Error: {result.get('error')}")
455
- if result.get('reviews'):
456
- print(f"\nFirst review: {result['reviews'][0][:100]}...")
457
- print(f"{'='*60}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Google Maps Review Scraper - 2025 Production Version
2
+ Updated with verified selectors from actual Google Maps DOM inspection.
 
 
 
 
 
 
 
 
3
 
4
+ Key improvements:
5
+ 1. Updated selectors based on actual HTML structure
6
+ 2. Better "More" button handling for truncated reviews
7
+ 3. Improved star rating extraction via aria-label
8
+ 4. Robust error handling and fallbacks
9
+ 5. Configurable chromedriver path
10
  """
11
 
12
  import time
13
+ import re
14
+ import os
15
  from typing import List, Dict, Any, Optional, Callable
16
  from selenium import webdriver
 
 
 
 
 
 
 
17
  from selenium.common.exceptions import (
 
18
  NoSuchElementException,
19
  StaleElementReferenceException,
20
+ TimeoutException,
21
+ ElementClickInterceptedException
22
  )
23
+ from selenium.webdriver.common.by import By
24
+ from selenium.webdriver.support.ui import WebDriverWait
25
+ from selenium.webdriver.support import expected_conditions as EC
26
+ from selenium.webdriver.chrome.options import Options
27
+ from selenium.webdriver.chrome.service import Service
28
+ from selenium.webdriver.common.action_chains import ActionChains
29
+ import random
30
 
31
 
32
  class GoogleMapsScraper:
33
  """
34
+ Scrapes restaurant reviews from Google Maps.
35
+
36
+ Selectors updated based on actual DOM inspection (Nov 2025):
37
+ - Review cards: div.jftiEf or div[data-review-id]
38
+ - Reviewer name: div.d4r55
39
+ - Star rating: span with aria-label containing "star"
40
+ - Date: span.rsqaWe (contains "ago")
41
+ - Review text: span.wiI7pd (truncated) or span[jsname='fbQN7e'] (full)
42
+ - More button: Various elements containing "More"
43
+ - Scrollable container: div.m6QErb or div.XiKgde
44
  """
45
 
46
    # Updated selectors based on actual Google Maps DOM (Nov 2025).
    # Each key maps to a list of XPath expressions tried in order; the first
    # that matches wins, so put the most specific/reliable selector first.
    # Selectors starting with ".//" are relative to a review card element;
    # the rest are document-absolute.
    SELECTORS = {
        # Parent container for the business listing
        "business_container": [
            "//div[contains(@class, 'WNBkOb')]",
        ],

        # Reviews tab button
        "reviews_tab": [
            "//button[@role='tab'][contains(., 'Reviews')]",
            "//button[contains(@aria-label, 'Reviews')]",
            "//div[@role='tab'][contains(., 'Reviews')]",
            "//button[contains(@data-tab-index, '1')]",  # Reviews is often tab index 1
        ],

        # Scrollable reviews container
        "scrollable_div": [
            "//div[contains(@class, 'm6QErb') and contains(@class, 'DxyBCb')]",
            "//div[contains(@class, 'XiKgde')]",
            "//div[@role='feed']",
            "//div[contains(@class, 'm6QErb')][@tabindex='-1']",
        ],

        # Individual review cards
        "review_cards": [
            "//div[@data-review-id]",
            "//div[contains(@class, 'jftiEf') and contains(@class, 'fontBodyMedium')]",
            "//div[contains(@class, 'jftiEf')]",
        ],

        # Reviewer name - updated without trailing space
        "reviewer_name": [
            ".//div[contains(@class, 'd4r55')]",
            ".//button[contains(@class, 'WEBjve')]",
            ".//div[@role='article']//span[contains(@class, 'd4r55')]",
            ".//a[contains(@class, 'WNBkOb')]//div[1]",
        ],

        # Star rating - use aria-label attribute (e.g. "5 stars")
        "rating": [
            ".//span[@aria-label][contains(@aria-label, 'star')]",
            ".//span[contains(@class, 'kvMYJc')]//span[@aria-label]",
            ".//div[@role='img'][@aria-label]",
        ],

        # Review date (relative strings like "2 weeks ago")
        "date": [
            ".//span[contains(@class, 'rsqaWe')]",
            ".//span[contains(text(), 'ago')]",
            ".//span[contains(text(), 'week')]",
            ".//span[contains(text(), 'month')]",
            ".//span[contains(text(), 'day')]",
            ".//span[contains(text(), 'year')]",
        ],

        # Review text - both truncated and full versions
        "review_text": [
            ".//span[contains(@class, 'wiI7pd')]",
            ".//span[@jsname='fbQN7e']",  # Full expanded text
            ".//span[@jsname='bN97Pc']",  # Truncated text
            ".//div[contains(@class, 'MyEned')]//span",
        ],

        # "More" button for expanding truncated reviews
        "more_button": [
            ".//button[contains(@class, 'w8nwRe')]",
            ".//button[contains(@class, 'kyuUzc')]",
            ".//button[contains(@aria-label, 'More')]",
            ".//button[contains(@aria-label, 'more')]",
            ".//span[text()='More']",
            ".//button[.//span[text()='More']]",
            ".//*[contains(text(), 'More') and not(contains(text(), 'More reviews'))]",
        ],
    }
120
 
121
    def __init__(self, headless: bool = True, chromedriver_path: Optional[str] = None):
        """
        Initialize the scraper.

        Args:
            headless: Run browser in headless mode
            chromedriver_path: Path to chromedriver (auto-detected if None)
        """
        self.headless = headless
        # WebDriver and its WebDriverWait are created lazily by _init_driver(),
        # so constructing a scraper never launches a browser.
        self.driver = None
        self.wait = None
        # Resolve the driver binary up front so a missing chromedriver is
        # discovered at construction time rather than mid-scrape.
        self.chromedriver_path = chromedriver_path or self._find_chromedriver()
133
+
134
+ def _find_chromedriver(self) -> str:
135
+ """Find chromedriver in common locations."""
136
+ common_paths = [
137
+ '/usr/local/bin/chromedriver',
138
+ '/usr/bin/chromedriver',
139
+ '/opt/chromedriver',
140
+ 'chromedriver', # In PATH
141
+ ]
142
+
143
+ for path in common_paths:
144
+ if os.path.exists(path):
145
+ return path
146
+
147
+ # Try webdriver-manager if available
148
+ try:
149
+ from webdriver_manager.chrome import ChromeDriverManager
150
+ return ChromeDriverManager().install()
151
+ except ImportError:
152
+ pass
153
+ except Exception as e:
154
+ print(f"[GMAPS] webdriver-manager failed: {e}")
155
+
156
+ # Default fallback - will be used on Modal which has chromedriver installed
157
+ return '/usr/local/bin/chromedriver'
158
+
159
    def _init_driver(self):
        """
        Initialize Chrome WebDriver with anti-detection settings.

        Side effects: sets self.driver (Chrome) and self.wait (15s
        WebDriverWait). Propagates selenium's exception if Chrome cannot be
        started even via the no-explicit-service fallback.
        """
        chrome_options = Options()

        if self.headless:
            chrome_options.add_argument('--headless=new')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')

        # Larger window for better element visibility
        chrome_options.add_argument('--window-size=1920,1080')

        # Realistic user agent
        chrome_options.add_argument(
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
        )

        # Anti-detection measures
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')

        # Disable images for faster loading (optional - comment out if you need images)
        # prefs = {"profile.managed_default_content_settings.images": 2}
        # chrome_options.add_experimental_option("prefs", prefs)

        try:
            service = Service(self.chromedriver_path)
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
        except Exception:
            # Fallback: try without explicit service path
            self.driver = webdriver.Chrome(options=chrome_options)

        self.driver.set_page_load_timeout(45)
        self.wait = WebDriverWait(self.driver, 15)

        # Additional anti-detection: hide the webdriver flag before any page
        # script runs, so Google's bot checks see a "normal" browser.
        self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': '''
                Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
                Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
            '''
        })
204
+
205
+ def _cleanup(self):
206
+ """Close browser."""
207
+ if self.driver:
208
+ try:
209
+ self.driver.quit()
210
+ except:
211
+ pass
212
+ self.driver = None
213
+
214
+ def _random_delay(self, min_sec: float = 0.5, max_sec: float = 1.5):
215
+ """Add random delay to mimic human behavior."""
216
+ time.sleep(random.uniform(min_sec, max_sec))
217
+
218
+ def _find_element_with_fallback(self, parent, selectors: List[str]):
219
+ """Try multiple selectors until one works."""
220
+ for selector in selectors:
221
+ try:
222
+ element = parent.find_element(By.XPATH, selector)
223
+ if element:
224
+ return element
225
+ except (NoSuchElementException, StaleElementReferenceException):
226
+ continue
227
+ return None
228
+
229
+ def _find_elements_with_fallback(self, selectors: List[str]) -> List:
230
+ """Try multiple selectors until one returns elements."""
231
+ for selector in selectors:
232
+ try:
233
+ elements = self.driver.find_elements(By.XPATH, selector)
234
+ if elements:
235
+ return elements
236
+ except:
237
+ continue
238
+ return []
239
+
240
+ def _extract_rating(self, review_element) -> float:
241
+ """
242
+ Extract star rating from review using aria-label.
243
+ Looks for aria-label like "5 stars" or "4 stars".
244
+ """
245
+ for selector in self.SELECTORS["rating"]:
246
+ try:
247
+ elem = review_element.find_element(By.XPATH, selector)
248
+ aria_label = elem.get_attribute('aria-label')
249
+ if aria_label:
250
+ # Extract number from "5 stars" or "4 stars"
251
+ match = re.search(r'(\d+)\s*star', aria_label.lower())
252
+ if match:
253
+ return float(match.group(1))
254
+ except (NoSuchElementException, StaleElementReferenceException):
255
+ continue
256
+ return 0.0
257
+
258
+ def _extract_text(self, parent, selectors: List[str]) -> str:
259
+ """Extract text using fallback selectors."""
260
+ for selector in selectors:
261
+ try:
262
+ element = parent.find_element(By.XPATH, selector)
263
+ text = element.text.strip()
264
+ if text:
265
+ return text
266
+ except (NoSuchElementException, StaleElementReferenceException):
267
+ continue
268
+ return ""
269
+
270
+ def _expand_review_text(self, review_element):
271
+ """
272
+ Click 'More' button to expand truncated review text.
273
+ Google Maps uses various elements for the More button.
274
+ """
275
+ for selector in self.SELECTORS["more_button"]:
276
+ try:
277
+ more_btn = review_element.find_element(By.XPATH, selector)
278
+ if more_btn and more_btn.is_displayed():
279
+ try:
280
+ # Try regular click first
281
+ more_btn.click()
282
+ except ElementClickInterceptedException:
283
+ # Fallback to JavaScript click
284
+ self.driver.execute_script("arguments[0].click();", more_btn)
285
+ self._random_delay(0.3, 0.6)
286
+ return True
287
+ except (NoSuchElementException, StaleElementReferenceException):
288
+ continue
289
+ return False
290
+
291
+ def _get_scrollable_element(self):
292
+ """Find the scrollable reviews container."""
293
+ for selector in self.SELECTORS["scrollable_div"]:
294
+ try:
295
+ element = self.driver.find_element(By.XPATH, selector)
296
+ if element:
297
+ return element
298
+ except NoSuchElementException:
299
+ continue
300
+ return None
301
+
302
+ def _scroll_reviews(self, scrollable_element, scroll_pause: float = 1.5):
303
+ """
304
+ Scroll the reviews panel to load more reviews.
305
+ Uses JavaScript to scroll the container.
306
+ """
307
+ if not scrollable_element:
308
+ return False
309
+
310
+ try:
311
+ # Get current scroll height
312
+ last_height = self.driver.execute_script(
313
+ "return arguments[0].scrollHeight",
314
+ scrollable_element
315
+ )
316
+
317
+ # Scroll to bottom
318
+ self.driver.execute_script(
319
+ "arguments[0].scrollTop = arguments[0].scrollHeight",
320
+ scrollable_element
321
+ )
322
+
323
+ # Wait for new content to load
324
+ time.sleep(scroll_pause + random.uniform(0, 0.5))
325
+
326
+ # Check if new content loaded
327
+ new_height = self.driver.execute_script(
328
+ "return arguments[0].scrollHeight",
329
+ scrollable_element
330
+ )
331
+
332
+ return new_height > last_height
333
+ except Exception as e:
334
+ print(f"Scroll error: {e}")
335
+ return False
336
+
337
+ def _click_reviews_tab(self) -> bool:
338
+ """Click on the Reviews tab to show reviews."""
339
+ for selector in self.SELECTORS["reviews_tab"]:
340
+ try:
341
+ tab = self.wait.until(EC.element_to_be_clickable((By.XPATH, selector)))
342
+ tab.click()
343
+ time.sleep(3) # Wait for reviews to load
344
+ return True
345
+ except (TimeoutException, NoSuchElementException, ElementClickInterceptedException):
346
+ continue
347
+ return False
348
+
349
+ def _parse_relative_date(self, date_str: str) -> str:
350
+ """
351
+ Return the date string as-is.
352
+ The UI code handles "X days ago" format already.
353
+ """
354
+ if not date_str:
355
+ return ""
356
+ return date_str.strip()
357
+
358
    def _extract_review_data(self, review_element, idx: int) -> Optional[Dict]:
        """
        Extract all data from a single review card.

        Args:
            review_element: WebElement for one review card.
            idx: Card index, used only in the error log message.

        Returns:
            Dict with 'name', 'date', 'rating', 'text' keys, or None when the
            card has no meaningful text (< 10 chars) or extraction fails.
        """
        try:
            # Try to expand truncated text first, so the longest variant is
            # available below.
            self._expand_review_text(review_element)

            # Extract reviewer name
            name = self._extract_text(review_element, self.SELECTORS["reviewer_name"])

            # Extract date
            date = self._extract_text(review_element, self.SELECTORS["date"])

            # Extract star rating
            rating = self._extract_rating(review_element)

            # Extract review text (try expanded first, then truncated)
            text = ""
            # First try to get the expanded/full text; keep whichever variant
            # is longest, since the truncated span may coexist with the full one.
            for selector in [".//span[@jsname='fbQN7e']", ".//span[contains(@class, 'wiI7pd')]"]:
                try:
                    elem = review_element.find_element(By.XPATH, selector)
                    t = elem.text.strip()
                    if t and len(t) > len(text):
                        text = t
                except:
                    continue

            # Fallback to general text selectors
            if not text:
                text = self._extract_text(review_element, self.SELECTORS["review_text"])

            # Validate - must have meaningful text
            if not text or len(text) < 10:
                return None

            return {
                'name': name,
                'date': self._parse_relative_date(date),
                'rating': rating,
                'text': text
            }

        except StaleElementReferenceException:
            # The card was re-rendered mid-extraction; caller will retry on
            # the next scroll pass.
            return None
        except Exception as e:
            print(f"[GMAPS] Error extracting review {idx}: {e}")
            return None
408
 
409
  def scrape_reviews(
410
  self,
411
  url: str,
412
+ max_reviews: Optional[int] = None,
413
  progress_callback: Optional[Callable[[str], None]] = None
414
  ) -> Dict[str, Any]:
415
  """
416
+ Scrape reviews from Google Maps restaurant page.
417
+
418
+ Args:
419
+ url: Google Maps restaurant URL
420
+ max_reviews: Maximum number of reviews to scrape
421
+ progress_callback: Optional callback for progress updates
422
+
423
+ Returns:
424
+ Dict with reviews data in same format as OpenTable scraper
425
  """
426
  if not self._validate_url(url):
427
+ return {
428
+ 'success': False,
429
+ 'error': 'Invalid Google Maps URL. Use google.com/maps or goo.gl/maps',
430
+ 'reviews': {}
431
+ }
432
 
433
  try:
434
  self._init_driver()
435
  except Exception as e:
436
+ return {
437
+ 'success': False,
438
+ 'error': f'Browser initialization failed: {str(e)}',
439
+ 'reviews': {}
440
+ }
 
441
 
442
  try:
443
+ self._log_progress("πŸš€ Starting Google Maps scraper...", progress_callback)
444
  self.driver.get(url)
445
+ time.sleep(5) # Wait for initial page load
446
 
447
+ # Click Reviews tab
448
+ self._log_progress("πŸ“‹ Looking for Reviews tab...", progress_callback)
449
+ if not self._click_reviews_tab():
450
+ self._log_progress("⚠️ Could not find Reviews tab, trying to scroll anyway...", progress_callback)
451
 
452
+ time.sleep(3)
 
 
 
 
 
453
 
454
  # Find scrollable container
455
+ scrollable = self._get_scrollable_element()
456
+ if not scrollable:
457
+ self._log_progress("⚠️ Could not find scrollable reviews container", progress_callback)
458
+
459
+ # Initialize data containers
460
+ names = []
461
+ dates = []
462
+ ratings = []
463
+ review_texts = []
464
 
465
+ processed_ids = set() # Track processed reviews to avoid duplicates
 
 
466
  scroll_count = 0
467
+ no_new_reviews_count = 0
468
+ max_no_new = 5 # Stop after 5 scrolls with no new reviews
469
 
470
+ max_scrolls = (max_reviews // 3) + 20 if max_reviews else 100
471
+
472
+ while scroll_count < max_scrolls and no_new_reviews_count < max_no_new:
473
  scroll_count += 1
474
 
475
+ # Find all review cards
476
+ review_elements = self._find_elements_with_fallback(self.SELECTORS["review_cards"])
477
+
478
+ self._log_progress(
479
+ f"πŸ“„ Scroll {scroll_count}: Found {len(review_elements)} review cards, "
480
+ f"collected {len(review_texts)} unique reviews",
481
+ progress_callback
482
+ )
483
+
484
+ new_reviews_this_scroll = 0
485
 
486
+ # Process each review card
487
+ for idx, review_elem in enumerate(review_elements):
488
+ # Check if we've reached the limit
489
+ if max_reviews and len(review_texts) >= max_reviews:
490
+ break
491
+
492
+ # Create unique identifier for this review
493
  try:
494
+ review_id = review_elem.get_attribute('data-review-id')
495
+ if not review_id:
496
+ # Fallback: use element position
497
+ review_id = f"pos_{idx}_{review_elem.location['y']}"
498
+ except:
499
+ review_id = f"idx_{idx}_{scroll_count}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
 
501
+ # Skip if already processed
502
+ if review_id in processed_ids:
503
  continue
504
+
505
+ # Extract review data
506
+ review_data = self._extract_review_data(review_elem, idx)
507
+
508
+ if review_data:
509
+ # Additional check: avoid duplicate text
510
+ if review_data['text'] not in review_texts:
511
+ names.append(review_data['name'])
512
+ dates.append(review_data['date'])
513
+ ratings.append(review_data['rating'])
514
+ review_texts.append(review_data['text'])
515
+ new_reviews_this_scroll += 1
516
+
517
+ processed_ids.add(review_id)
518
+
519
+ # Small delay between processing reviews
520
+ if idx % 5 == 0:
521
+ self._random_delay(0.1, 0.3)
522
 
523
+ # Check if we got new reviews
524
+ if new_reviews_this_scroll == 0:
525
+ no_new_reviews_count += 1
 
 
526
  else:
527
+ no_new_reviews_count = 0
528
 
529
+ # Check if we've reached the target
530
+ if max_reviews and len(review_texts) >= max_reviews:
531
+ self._log_progress(f"🎯 Reached target: {max_reviews} reviews", progress_callback)
532
+ break
533
+
534
+ # Scroll for more reviews
535
+ if scrollable:
536
+ self._scroll_reviews(scrollable)
537
+ else:
538
+ # Fallback: scroll the page
539
+ self.driver.execute_script("window.scrollBy(0, 500);")
540
+ time.sleep(1.5)
541
 
542
+ self._cleanup()
543
 
544
+ # Trim to max_reviews if needed
545
+ if max_reviews:
546
+ names = names[:max_reviews]
547
+ dates = dates[:max_reviews]
548
+ ratings = ratings[:max_reviews]
549
+ review_texts = review_texts[:max_reviews]
550
+
551
+ self._log_progress(
552
+ f"βœ… Scraped {len(review_texts)} reviews from Google Maps",
553
+ progress_callback
554
+ )
555
 
 
556
  return {
557
  'success': True,
558
+ 'total_reviews': len(review_texts),
559
  'total_pages': scroll_count,
560
+ 'reviews': {
561
  'names': names,
562
  'dates': dates,
563
  'overall_ratings': ratings,
564
+ # Google Maps doesn't have sub-ratings, fill with zeros
565
  'food_ratings': [0.0] * len(ratings),
566
  'service_ratings': [0.0] * len(ratings),
567
  'ambience_ratings': [0.0] * len(ratings),
568
+ 'review_texts': review_texts
569
  },
570
  'metadata': {
571
  'source': 'google_maps',
572
+ 'url': url,
573
  'scroll_count': scroll_count
574
  }
575
  }
576
 
 
 
 
 
577
  except Exception as e:
 
 
578
  self._cleanup()
579
+ import traceback
580
+ traceback.print_exc()
581
+ return {
582
+ 'success': False,
583
+ 'error': str(e),
584
+ 'reviews': {}
585
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
 
587
  def _validate_url(self, url: str) -> bool:
588
  """Validate Google Maps URL."""
589
+ if not url:
590
+ return False
591
  url_lower = url.lower()
592
+ return any(x in url_lower for x in [
593
+ 'google.com/maps',
594
+ 'goo.gl/maps',
595
+ 'maps.google',
596
+ 'maps.app.goo.gl'
597
+ ])
598
 
599
+ def _log_progress(self, message: str, callback: Optional[Callable]):
600
+ """Log progress with emoji indicators."""
601
  print(message)
602
  if callback:
603
  callback(message)
 
606
  self._cleanup()
607
 
608
 
609
+ def scrape_google_maps(
610
+ url: str,
611
+ max_reviews: Optional[int] = None,
612
+ headless: bool = True,
613
+ chromedriver_path: Optional[str] = None
614
+ ) -> Dict[str, Any]:
615
  """
616
  Scrape reviews from Google Maps.
617
+
618
+ Args:
619
+ url: Google Maps restaurant URL
620
+ max_reviews: Maximum number of reviews to scrape (None = all available)
621
+ headless: Run browser in headless mode
622
+ chromedriver_path: Optional path to chromedriver
623
+
624
+ Returns:
625
+ Dict with 'success', 'total_reviews', and 'reviews' data
626
+ (same format as OpenTable scraper for compatibility)
627
  """
628
+ scraper = GoogleMapsScraper(headless=headless, chromedriver_path=chromedriver_path)
629
+ return scraper.scrape_reviews(url, max_reviews=max_reviews)
630
 
631
 
632
  if __name__ == "__main__":
633
+ print("=" * 80)
634
+ print("πŸ—ΊοΈ Google Maps Review Scraper - Production Test (Nov 2025)")
635
+ print("=" * 80 + "\n")
636
+
637
+ # Test URL - Tutto Italian Restaurant & Bar
638
+ test_url = "https://www.google.com/maps/place/Tutto+Italian+Restaurant+%26+Bar"
639
 
640
+ print(f"🎯 Target: {test_url}")
641
+ print("πŸ“Š Limit: 20 reviews (test mode)")
642
+ print("πŸ€– Mode: HEADLESS\n")
643
+
644
+ result = scrape_google_maps(test_url, max_reviews=20, headless=True)
645
+
646
+ print("\n" + "=" * 80)
647
+ if result['success']:
648
+ print("βœ… SUCCESS!")
649
+ print(f" πŸ“Š Total reviews scraped: {result['total_reviews']}")
650
+ print(f" πŸ“œ Scroll iterations: {result.get('total_pages', 'N/A')}")
651
+
652
+ if result['total_reviews'] > 0:
653
+ print(f"\n πŸ” Sample (first review):")
654
+ print(f" πŸ‘€ Name: {result['reviews']['names'][0]}")
655
+ print(f" πŸ“… Date: {result['reviews']['dates'][0]}")
656
+ print(f" ⭐ Rating: {result['reviews']['overall_ratings'][0]}")
657
+ text = result['reviews']['review_texts'][0]
658
+ print(f" πŸ’¬ Review: {text[:150]}{'...' if len(text) > 150 else ''}")
659
+
660
+ print(f"\n πŸ“Š Rating distribution:")
661
+ ratings = result['reviews']['overall_ratings']
662
+ for star in range(5, 0, -1):
663
+ count = ratings.count(float(star))
664
+ print(f" {'⭐' * star}: {count} reviews")
665
+ else:
666
+ print("❌ FAILED")
667
+ print(f" Error: {result.get('error', 'Unknown error')}")
668
+ print("=" * 80)