TushP committed on
Commit
70bcde7
·
verified ·
1 Parent(s): 1ee12d1

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. src/scrapers/google_maps_scraper.py +25 -8
src/scrapers/google_maps_scraper.py CHANGED
@@ -45,10 +45,11 @@ class GoogleMapsScraper:
45
  "//div[@role='tablist']//button[contains(., 'Review')]",
46
  ],
47
 
48
- # Scrollable container - VERIFIED: div with role="feed"
49
  "scroll_container": [
50
- "//div[@role='feed']",
51
  "//div[contains(@class, 'm6QErb') and contains(@class, 'DxyBCb')]",
 
 
52
  "//div[contains(@class, 'm6QErb')][@tabindex='-1']",
53
  ],
54
 
@@ -59,10 +60,11 @@ class GoogleMapsScraper:
59
  "//div[contains(@class, 'jftiEf') and contains(@class, 'fontBodyMedium')]",
60
  ],
61
 
62
- # Review text - VERIFIED: span.wiI7pd
63
  "review_text": [
64
- ".//span[@class='wiI7pd']",
65
  ".//span[contains(@class, 'wiI7pd')]",
 
 
66
  ".//div[contains(@class, 'MyEned')]//span",
67
  ],
68
 
@@ -163,8 +165,21 @@ class GoogleMapsScraper:
163
  # Click "More" to expand if needed
164
  self._expand_review(card)
165
 
166
- # Extract text
167
- text = self._extract_text(card, self.SELECTORS["review_text"])
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
  if text and len(text.strip()) > 10:
170
  collected_ids.add(card_id)
@@ -351,14 +366,16 @@ class GoogleMapsScraper:
351
  return 0.0
352
 
353
  def _scroll_down(self, container):
354
- """Scroll down in the container."""
355
  try:
356
  if container:
 
357
  self.driver.execute_script(
358
- "arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight * 0.8",
359
  container
360
  )
361
  else:
 
362
  ActionChains(self.driver).send_keys(Keys.PAGE_DOWN).perform()
363
  except:
364
  ActionChains(self.driver).send_keys(Keys.PAGE_DOWN).perform()
 
45
  "//div[@role='tablist']//button[contains(., 'Review')]",
46
  ],
47
 
48
+ # Scrollable container - from working version
49
  "scroll_container": [
 
50
  "//div[contains(@class, 'm6QErb') and contains(@class, 'DxyBCb')]",
51
+ "//div[contains(@class, 'XiKgde')]",
52
+ "//div[@role='feed']",
53
  "//div[contains(@class, 'm6QErb')][@tabindex='-1']",
54
  ],
55
 
 
60
  "//div[contains(@class, 'jftiEf') and contains(@class, 'fontBodyMedium')]",
61
  ],
62
 
63
+ # Review text - from working version with expanded text support
64
  "review_text": [
 
65
  ".//span[contains(@class, 'wiI7pd')]",
66
+ ".//span[@jsname='fbQN7e']", # Full expanded text
67
+ ".//span[@jsname='bN97Pc']", # Truncated text
68
  ".//div[contains(@class, 'MyEned')]//span",
69
  ],
70
 
 
165
  # Click "More" to expand if needed
166
  self._expand_review(card)
167
 
168
+ # Extract text - try multiple approaches like working version
169
+ text = ""
170
+ # First try expanded text selectors
171
+ for selector in [".//span[@jsname='fbQN7e']", ".//span[contains(@class, 'wiI7pd')]"]:
172
+ try:
173
+ elem = card.find_element(By.XPATH, selector)
174
+ t = elem.text.strip()
175
+ if t and len(t) > len(text):
176
+ text = t
177
+ except:
178
+ continue
179
+
180
+ # Fallback to general text selectors
181
+ if not text:
182
+ text = self._extract_text(card, self.SELECTORS["review_text"])
183
 
184
  if text and len(text.strip()) > 10:
185
  collected_ids.add(card_id)
 
366
  return 0.0
367
 
368
  def _scroll_down(self, container):
369
+ """Scroll down in the container - matches working version."""
370
  try:
371
  if container:
372
+ # Scroll to bottom (like working version)
373
  self.driver.execute_script(
374
+ "arguments[0].scrollTop = arguments[0].scrollHeight",
375
  container
376
  )
377
  else:
378
+ # Fallback: scroll the page
379
  ActionChains(self.driver).send_keys(Keys.PAGE_DOWN).perform()
380
  except:
381
  ActionChains(self.driver).send_keys(Keys.PAGE_DOWN).perform()